From fd5b75b9646c94016d5b04be6a6df9f468aa4d03 Mon Sep 17 00:00:00 2001 From: Taku Kudo Date: Wed, 8 Jun 2022 02:22:21 +0900 Subject: [PATCH] update python wrapper. Signed-off-by: Kentaro Hayashi Gbp-Pq: Name 0001-update-python-wrapper.patch --- python/make_py_wheel.sh | 73 - python/make_py_wheel_mac.sh | 89 - python/once.h | 157 -- python/src/sentencepiece/__init__.py | 293 +- python/src/sentencepiece/sentencepiece.i | 648 ++++- .../src/sentencepiece/sentencepiece_wrap.cxx | 2383 +++++++++++------ python/test/sentencepiece_test.py | 424 +-- 7 files changed, 2575 insertions(+), 1492 deletions(-) delete mode 100755 python/make_py_wheel.sh delete mode 100755 python/make_py_wheel_mac.sh delete mode 100644 python/once.h diff --git a/python/make_py_wheel.sh b/python/make_py_wheel.sh deleted file mode 100755 index 2e123ce..0000000 --- a/python/make_py_wheel.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -# Copyright 2018 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.! -set -e # exit immediately on error -set -x # display all commands - -CMAKE_VERSION=3.12.0 - -run_docker() { - cd `dirname $0` - docker pull $1 - docker run --rm -ti --name py_sentencepiece \ - -v `pwd`/../:/sentencepiece -w /sentencepiece/python \ - -td $1 /bin/bash - docker exec py_sentencepiece bash -c "./make_py_wheel.sh native $2" - docker stop py_sentencepiece -} - -build() { - TRG=$1 - rm -fr build - mkdir -p build - cd build - - # Install sentencepiece - cmake ../.. -DSPM_ENABLE_SHARED=OFF - make -j4 - make install - cd .. - - for i in /opt/python/* - do - export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib - $i/bin/python setup.py clean - $i/bin/python setup.py bdist - strip build/*/*/*.so - $i/bin/python setup.py bdist_wheel - $i/bin/python setup.py test - rm -fr build - rm -fr *.so - done - - cd dist - for i in *${TRG}.whl - do - auditwheel repair $i - done - - mv -f wheelhouse/*${TRG}.whl . - - cd .. - rm -fr build -} - -if [ "$1" = "native" ]; then - build $2 -elif [ "$#" -eq 1 ]; then - run_docker quay.io/pypa/manylinux2014_${1} ${1} -else - run_docker quay.io/pypa/manylinux2014_i686 i686 - run_docker quay.io/pypa/manylinux2014_x86_64 x86_64 -fi diff --git a/python/make_py_wheel_mac.sh b/python/make_py_wheel_mac.sh deleted file mode 100755 index bed7366..0000000 --- a/python/make_py_wheel_mac.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# Copyright 2018 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.! - -set -e # exit immediately on error -set -x # display all commands - -build_python() { - VERSION=$1 - URL=$2 - INSTALL_PATH="/Library/Frameworks/Python.framework/Versions/${VERSION}/bin" - CURRENT_PATH=${PATH} - - curl -L -o python.pkg ${URL} - sudo installer -pkg python.pkg -target / - - if [ -f "${INSTALL_PATH}/python3" ]; then - ln -s ${INSTALL_PATH}/python3 ${INSTALL_PATH}/python - ln -s ${INSTALL_PATH}/python3-config ${INSTALL_PATH}/python-config - ln -s ${INSTALL_PATH}/pip3 ${INSTALL_PATH}/pip - fi - - export PATH="${INSTALL_PATH}:${CURRENT_PATH}" - ls -l ${INSTALL_PATH} - which python - which pip - python --version - curl -L -o get-pip.py https://bootstrap.pypa.io/pip/3.6/get-pip.py - sudo python ./get-pip.py --no-setuptools --no-wheel --ignore-installed - pip install --upgrade setuptools - pip install wheel - pip install delocate - python setup.py clean - python setup.py bdist_wheel --plat-name=macosx_10_6_x86_64 - python setup.py test - delocate-listdeps dist/*.whl - delocate-wheel -w dist/delocated_wheel dist/*.whl - export PATH="${CURRENT_PATH}" - - ls -l dist/delocated_wheel - rm -fr build - rm -fr *.so - rm -fr dist/*.whl - rm -fr python.pkg -} - -build() { - cd python - rm -fr build - mkdir -p build - cd build - - # Install sentencepiece - cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_NO_THREADLOCAL=ON - make -j4 VERBOSE=1 - make install - cd .. - - mkdir -p dist/delocated_wheel - -# build_python 2.7 https://www.python.org/ftp/python/2.7.15/python-2.7.15-macosx10.6.pkg -# latest pip doesn't support Py3.4 - # build_python 3.4 https://www.python.org/ftp/python/3.4.4/python-3.4.4-macosx10.6.pkg - curl -L -O https://bootstrap.pypa.io/pip/3.5/get-pip.py - build_python 3.5 https://www.python.org/ftp/python/3.5.4/python-3.5.4-macosx10.6.pkg - - curl -L -O https://bootstrap.pypa.io/get-pip.py - build_python 3.6 https://www.python.org/ftp/python/3.6.6/python-3.6.6-macosx10.6.pkg - build_python 3.7 https://www.python.org/ftp/python/3.7.9/python-3.7.9-macosx10.9.pkg - build_python 3.8 https://www.python.org/ftp/python/3.8.6/python-3.8.6-macosx10.9.pkg - build_python 3.9 https://www.python.org/ftp/python/3.9.0/python-3.9.0-macosx10.9.pkg - - cd .. - - rm -fr build -} - -build diff --git a/python/once.h b/python/once.h deleted file mode 100644 index fc7553a..0000000 --- a/python/once.h +++ /dev/null @@ -1,157 +0,0 @@ -// Protocol Buffers - Google's data interchange format -// Copyright 2008 Google Inc. All rights reserved. -// https://developers.google.com/protocol-buffers/ -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Author: kenton@google.com (Kenton Varda) -// -// emulates google3/base/once.h -// -// This header is intended to be included only by internal .cc files and -// generated .pb.cc files. Users should not use this directly. -// -// This is basically a portable version of pthread_once(). -// -// This header declares: -// * A type called ProtobufOnceType. -// * A macro GOOGLE_PROTOBUF_DECLARE_ONCE() which declares a variable of type -// ProtobufOnceType. This is the only legal way to declare such a variable. -// The macro may only be used at the global scope (you cannot create local or -// class member variables of this type). -// * A function GoogleOnceInit(ProtobufOnceType* once, void (*init_func)()). -// This function, when invoked multiple times given the same ProtobufOnceType -// object, will invoke init_func on the first call only, and will make sure -// none of the calls return before that first call to init_func has finished. -// * The user can provide a parameter which GoogleOnceInit() forwards to the -// user-provided function when it is called. Usage example: -// int a = 10; -// GoogleOnceInit(&my_once, &MyFunctionExpectingIntArgument, &a); -// * This implementation guarantees that ProtobufOnceType is a POD (i.e. no -// static initializer generated). -// -// This implements a way to perform lazy initialization. It's more efficient -// than using mutexes as no lock is needed if initialization has already -// happened. -// -// Example usage: -// void Init(); -// GOOGLE_PROTOBUF_DECLARE_ONCE(once_init); -// -// // Calls Init() exactly once. -// void InitOnce() { -// GoogleOnceInit(&once_init, &Init); -// } -// -// Note that if GoogleOnceInit() is called before main() has begun, it must -// only be called by the thread that will eventually call main() -- that is, -// the thread that performs dynamic initialization. In general this is a safe -// assumption since people don't usually construct threads before main() starts, -// but it is technically not guaranteed. Unfortunately, Win32 provides no way -// whatsoever to statically-initialize its synchronization primitives, so our -// only choice is to assume that dynamic initialization is single-threaded. - -#ifndef GOOGLE_PROTOBUF_STUBS_ONCE_H__ -#define GOOGLE_PROTOBUF_STUBS_ONCE_H__ - -#include -#include -#include -#include - -namespace google { -namespace protobuf { -namespace internal { - -using once_flag = std::atomic; - -template -void my_call_once(once_flag& once, Callable&& fn, Args&&... args) { - enum CallOnceState { - ONCE_INIT = 0, - ONCE_RUNNING = 1, - ONCE_DONE = 2, - }; - - int expected_state = ONCE_INIT; - if (once.compare_exchange_strong(expected_state, ONCE_RUNNING)) { - fn(std::forward(args)...); - once.store(ONCE_DONE); - return; - } - - if (expected_state == ONCE_DONE) { - return; - } - - while (once.load() == ONCE_RUNNING) { - sched_yield(); - } -} - -template -void call_once(Args&&... args) { - my_call_once(std::forward(args)...); -} -} // namespace internal - -// TODO(gerbens) remove this once third_party is fully extracted -using ProtobufOnceType = internal::once_flag; - -inline void GoogleOnceInit(ProtobufOnceType* once, void (*init_func)()) { - internal::my_call_once(*once, init_func); -} - -template -inline void GoogleOnceInitArg(ProtobufOnceType* once, void (*init_func)(Arg*), - Arg* arg) { - internal::my_call_once(*once, init_func, arg); -} - -class GoogleOnceDynamic { - public: - // If this->Init() has not been called before by any thread, - // execute (*func_with_arg)(arg) then return. - // Otherwise, wait until that prior invocation has finished - // executing its function, then return. - template - void Init(void (*func_with_arg)(T*), T* arg) { - GoogleOnceInitArg(&this->state_, func_with_arg, arg); - } - - private: - ProtobufOnceType state_; -}; - -#define GOOGLE_PROTOBUF_ONCE_TYPE ::google::protobuf::ProtobufOnceType -#define GOOGLE_PROTOBUF_DECLARE_ONCE(NAME) \ - ::google::protobuf::ProtobufOnceType NAME - -} // namespace protobuf -} // namespace google - -#endif // GOOGLE_PROTOBUF_STUBS_ONCE_H__ diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py index fdb5976..cba3b70 100644 --- a/python/src/sentencepiece/__init__.py +++ b/python/src/sentencepiece/__init__.py @@ -87,48 +87,15 @@ class SentencePieceProcessor(object): def LoadVocabulary(self, filename, threshold): return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold) - def EncodeAsPieces(self, input): - return _sentencepiece.SentencePieceProcessor_EncodeAsPieces(self, input) - - def EncodeAsIds(self, input): - return _sentencepiece.SentencePieceProcessor_EncodeAsIds(self, input) - - def NBestEncodeAsPieces(self, input, nbest_size): - return _sentencepiece.SentencePieceProcessor_NBestEncodeAsPieces(self, input, nbest_size) - - def NBestEncodeAsIds(self, input, nbest_size): - return _sentencepiece.SentencePieceProcessor_NBestEncodeAsIds(self, input, nbest_size) - - def SampleEncodeAsPieces(self, input, nbest_size, alpha): - return _sentencepiece.SentencePieceProcessor_SampleEncodeAsPieces(self, input, nbest_size, alpha) - - def SampleEncodeAsIds(self, input, nbest_size, alpha): - return _sentencepiece.SentencePieceProcessor_SampleEncodeAsIds(self, input, nbest_size, alpha) - def SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best): return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsPieces(self, input, num_samples, theta, wor, include_best) def SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best): return _sentencepiece.SentencePieceProcessor_SampleEncodeAndScoreAsIds(self, input, num_samples, theta, wor, include_best) - def DecodePieces(self, pieces): - return _sentencepiece.SentencePieceProcessor_DecodePieces(self, pieces) - def CalculateEntropy(self, text, theta): return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, text, theta) - def EncodeAsSerializedProto(self, input): - return _sentencepiece.SentencePieceProcessor_EncodeAsSerializedProto(self, input) - - def SampleEncodeAsSerializedProto(self, input, nbest_size, alpha): - return _sentencepiece.SentencePieceProcessor_SampleEncodeAsSerializedProto(self, input, nbest_size, alpha) - - def NBestEncodeAsSerializedProto(self, input, nbest_size): - return _sentencepiece.SentencePieceProcessor_NBestEncodeAsSerializedProto(self, input, nbest_size) - - def DecodePiecesAsSerializedProto(self, pieces): - return _sentencepiece.SentencePieceProcessor_DecodePiecesAsSerializedProto(self, pieces) - def GetPieceSize(self): return _sentencepiece.SentencePieceProcessor_GetPieceSize(self) @@ -171,30 +138,69 @@ class SentencePieceProcessor(object): def LoadFromFile(self, arg): return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg) - def DecodeIdsWithCheck(self, ids): - return _sentencepiece.SentencePieceProcessor_DecodeIdsWithCheck(self, ids) + def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + + def _EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + + def _EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + + def _EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + + def _EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + + def _EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) - def DecodeIdsAsSerializedProtoWithCheck(self, ids): - return _sentencepiece.SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck(self, ids) + def _DecodeIds(self, ids): + return _sentencepiece.SentencePieceProcessor__DecodeIds(self, ids) - def _EncodeAsIds(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse): - return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse) + def _DecodePieces(self, pieces): + return _sentencepiece.SentencePieceProcessor__DecodePieces(self, pieces) - def _EncodeAsPieces(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsPieces(self, text, enabele_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + def _DecodeIdsAsSerializedProto(self, ids): + return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProto(self, ids) - def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse): - return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse) + def _DecodePiecesAsSerializedProto(self, pieces): + return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces) + + def _DecodeIdsBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads) + + def _DecodeIdsAsSerializedProtoBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(self, ins, num_threads) + + def _DecodePiecesBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodePiecesBatch(self, ins, num_threads) + + def _DecodePiecesAsSerializedProtoBatch(self, ins, num_threads): + return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads) + + def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) def _NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): return _sentencepiece.SentencePieceProcessor__NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) - def _SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse): - return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse) + def _NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + + def _SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): + return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) def _SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + def _CalculateEntropy(self, text, theta): + return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, theta) + + def _CalculateEntropyBatch(self, ins, theta, num_threads): + return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, theta, num_threads) + def Init(self, model_file=None, model_proto=None, @@ -205,7 +211,8 @@ class SentencePieceProcessor(object): emit_unk_piece=False, enable_sampling=False, nbest_size=-1, - alpha=0.1): + alpha=0.1, + num_threads=1): """Initialzie sentencepieceProcessor. Args: @@ -225,6 +232,7 @@ class SentencePieceProcessor(object): forward-filtering-and-backward-sampling algorithm. alpha: Soothing parameter for unigram sampling, and dropout probability of merge operations for BPE-dropout. + num_threads: number of threads in batch processing. """ _sentencepiece_processor_init_native(self) @@ -236,6 +244,7 @@ class SentencePieceProcessor(object): self._enable_sampling = enable_sampling self._nbest_size = nbest_size self._alpha = alpha + self._num_threads = num_threads if model_file or model_proto: self.Load(model_file=model_file, model_proto=model_proto) @@ -249,7 +258,8 @@ class SentencePieceProcessor(object): emit_unk_piece=None, enable_sampling=None, nbest_size=None, - alpha=None): + alpha=None, + num_threads=None): """Encode text input to segmented ids or tokens. Args: @@ -268,6 +278,7 @@ class SentencePieceProcessor(object): forward-filtering-and-backward-sampling algorithm. alpha: Soothing parameter for unigram sampling, and merge probability for BPE-dropout (probablity 'p' in BPE-dropout paper). + num_threads: the number of threads used in the batch processin (Default = 1). """ if out_type is None: @@ -286,6 +297,8 @@ class SentencePieceProcessor(object): nbest_size = self._nbest_size if alpha is None: alpha = self._alpha + if num_threads is None: + num_threads = self._num_threads if enable_sampling == True and (nbest_size is None or nbest_size == 0 or nbest_size == 1 or alpha is None): @@ -296,18 +309,59 @@ class SentencePieceProcessor(object): 'instead of nbest segmentations.' ) - def _encode(text): - if out_type is int: - return self._EncodeAsIds(text, enable_sampling, nbest_size, - alpha, add_bos, add_eos, reverse) - else: - return self._EncodeAsPieces(text, enable_sampling, nbest_size, - alpha, add_bos, add_eos, reverse, emit_unk_piece) + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') if type(input) is list: - return [_encode(n) for n in input] + if out_type is int: + return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type is str: + return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'proto': + return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + + if out_type is int: + return self._EncodeAsIds(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type is str: + return self._EncodeAsPieces(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'proto': + return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + + raise RuntimeError('unknown out_type={}'.format(out_type)) + return None - return _encode(input) + + def EncodeAsPieces(self, input, **kwargs): + return self.Encode(input=input, out_type=str, **kwargs) + + + def EncodeAsIds(self, input, **kwargs): + return self.Encode(input=input, out_type=int, **kwargs) + + + def EncodeAsSerializedProto(self, input, **kwargs): + return self.Encode(input=input, out_type='proto', **kwargs) + + + def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type=str, enable_sampling=True, **kwargs) + + + def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type=int, enable_sampling=True, **kwargs) + + + def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type='proto', enable_sampling=True, **kwargs) def NBestEncode(self, @@ -348,9 +402,14 @@ class SentencePieceProcessor(object): def _encode(text): if out_type is int: - return self._NBestEncodeAsIds(text, nbest_size, add_bos, add_eos, reverse) - else: - return self._NBestEncodeAsPieces(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + return self._NBestEncodeAsIds(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) + if out_type is str: + return self._NBestEncodeAsPieces(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'proto': + return self._NBestEncodeAsSerializedProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) if type(input) is list: return [_encode(n) for n in input] @@ -358,6 +417,21 @@ class SentencePieceProcessor(object): return _encode(input) + def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type=str, **kwargs) + + + def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type=int, **kwargs) + + + def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type='proto', **kwargs) + + def SampleEncodeAndScore(self, input, out_type=None, @@ -373,7 +447,7 @@ class SentencePieceProcessor(object): Args: input: input string. accepsts list of string. - out_type: output type. int or str. + out_type: output type. int or str or 'proto'. add_bos: Add to the result (Default = false) add_eos: Add to the result (Default = false) / is added after reversing (if enabled). reverse: Reverses the tokenized sequence (Default = false) @@ -413,7 +487,7 @@ class SentencePieceProcessor(object): def _encode(text): if out_type is int: return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, - add_bos, add_eos, reverse) + add_bos, add_eos, reverse, emit_unk_piece) else: return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) @@ -424,35 +498,90 @@ class SentencePieceProcessor(object): return _encode(input) - def Decode(self, input): - """Decode processed id or token sequences.""" + def Decode(self, input, out_type=str, num_threads=None): + """Decode processed id or token sequences. + + Args: + out_type: output type. str or 'proto' (Default = str) + num_threads: the number of threads used in the batch processin (Default = 1). + """ + + if num_threads is None: + num_threads = self._num_threads + + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') if not input: - return self.DecodeIds([]) - elif type(input) is int: - return self.DecodeIdsWithCheck([input]) - elif type(input) is str: - return self.DecodePieces([input]) + return '' + + if out_type is str: + if type(input) is int: + return self._DecodeIds([input]) + if type(input) is str: + return self._DecodePieces([input]) + + if type(input) is list: + if len(input) == 0 or type(input[0]) is int: + return self._DecodeIds(input) + if type(input[0]) is str: + return self._DecodePieces(input) + + if type(input[0]) is list: + if len(input[0]) == 0 or type(input[0][0]) is int: + return self._DecodeIdsBatch(input, num_threads) + if type(input[0][0]) is str: + return self._DecodePiecesBatch(input, num_threads) + + if out_type == 'proto': + if type(input) is int: + return self._DecodeIdsAsSerializedProto([input]) + if type(input) is str: + return self._DecodePiecesAsSerializedProto([input]) + + if type(input) is list: + if len(input) == 0 or type(input[0]) is int: + return self._DecodeIdsAsSerializedProto(input) + if type(input[0]) is str: + return self._DecodePiecesAsSerializedProto(input) + + if type(input[0]) is list: + if len(input[0]) == 0 or type(input[0][0]) is int: + return self._DecodeIdsAsSerializedProtoBatch(input, num_threads) + if type(input[0][0]) is str: + return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) + + + raise RuntimeError('unknown output or input type') + return None - def _decode(input): - if not input: - return self.DecodeIds([]) - if type(input[0]) is int: - return self.DecodeIdsWithCheck(input) - return self.DecodePieces(input) - if type(input[0]) is list: - return [_decode(n) for n in input] + def DecodePieces(self, input, out_type=str, **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) - return _decode(input) + def DecodeIds(self, input, out_type=str, **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) - def Entropy(self, input, theta): - """Calculate sentence entropy""" + def DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def CalculateEntropy(self, input, theta, num_threads=None): + """Calculate sentence entropy""" if type(input) is list: - return [self.CalculateEntropy(n, theta) for n in input] - return self.CalculateEntropy(input, theta) + if num_threads is None: + num_threads = self._num_threads + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') + return self._CalculateEntropyBatch(input, theta, num_threads) + + return self._CalculateEntropy(input, theta) def piece_size(self): @@ -642,8 +771,6 @@ setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init) SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode -SentencePieceProcessor.DecodeIds = SentencePieceProcessor.DecodeIdsWithCheck -SentencePieceProcessor.DecodeIdsAsSerializedProto = SentencePieceProcessor.DecodeIdsAsSerializedProtoWithCheck for m in [ 'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused', diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i index 21bb7cf..3a822bc 100644 --- a/python/src/sentencepiece/sentencepiece.i +++ b/python/src/sentencepiece/sentencepiece.i @@ -2,9 +2,13 @@ %include exception.i %{ +#include #include +#include #include #include +#include +#include #include #include @@ -12,6 +16,8 @@ namespace { PyObject* kUnicodeInput = reinterpret_cast(0x1); PyObject* kByteInput = reinterpret_cast(0x2); +using BytesArray = std::vector; + inline void ReleaseResultObject(PyObject *obj) { if (obj != nullptr && obj != kUnicodeInput && obj != kByteInput) { Py_XDECREF(obj); @@ -54,7 +60,7 @@ PyObject* MakePyOutputString(const std::string& output, return PyBytes_FromStringAndSize(output.data(), output.size()); } -PyObject* MakePyOutputBytes(const std::string& output) { +PyObject* MakePyOutputBytes(const sentencepiece::util::bytes& output) { return PyBytes_FromStringAndSize(output.data(), output.size()); } @@ -126,18 +132,18 @@ class PySentenceIterator : public sentencepiece::SentenceIterator { sentencepiece::util::Status status_; }; -void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, - std::vector *ids, - bool add_bos, bool add_eos, bool reverse) { +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + std::vector *ids, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { if (!add_bos && !add_eos && !reverse) return; if (reverse) std::reverse(ids->begin(), ids->end()); if (add_bos) ids->insert(ids->begin(), sp.bos_id()); if (add_eos) ids->push_back(sp.eos_id()); } -void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, - std::vector *pieces, - bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + std::vector *pieces, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { if (!add_bos && !add_eos && !reverse && !emit_unk_piece) return; if (reverse) std::reverse(pieces->begin(), pieces->end()); if (add_bos) pieces->insert(pieces->begin(), sp.IdToPiece(sp.bos_id())); @@ -152,6 +158,98 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, } } } + +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + sentencepiece::util::bytes *proto, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (add_bos || add_eos || reverse || emit_unk_piece) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kUnimplemented, + "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); + } +} + +inline void CheckIds(const std::vector &ids, int num_pieces) { + for (int id : ids) { + if (id < 0 || id >= num_pieces) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, + "piece id is out of range."); + } + } +} + +inline void CheckIds(const std::vector &ids, int num_pieces) {} + +class ThreadPool { + public: + explicit ThreadPool(size_t request_size) : + request_size_(request_size) {} + + virtual ~ThreadPool() { + for (auto &task : tasks_) { + task.join(); + } + } + + void Schedule(std::function closure) { + static constexpr size_t kMinThreadSize = 2; + if (request_size_ < kMinThreadSize) { + closure(); + } else { + tasks_.emplace_back(closure); + } + } + + private: + size_t request_size_ = 0; + std::vector tasks_; +}; + +template +inline void InitNumThreads(const std::vector &ins, int *num_threads) { + *num_threads = std::max(1, + std::min({*num_threads, + static_cast(ins.size()), 256})); +} + +#define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ + InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ + pool.Schedule([&, n]() { \ + for (size_t i = n; i < ins.size(); i += num_threads) { \ + auto out = enable_sampling ? \ + self->Sample##FuncName(ins[i], \ + nbest_size, alpha) : \ + self->FuncName(ins[i]); \ + RewriteIds(*self, &out, add_bos, add_eos, reverse, \ + emit_unk_piece); \ + outs[i] = std::move(out); \ + } \ + }); \ + } \ + } \ + return outs; + +#define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ + InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ + pool.Schedule([&, n]() { \ + for (size_t i = n; i < ins.size(); i += num_threads) { \ + CheckIds(ins[i], self->GetPieceSize()); \ + outs[i] = self->FuncName(ins[i]); \ + } \ + }); \ + } \ + } \ + return outs; + } // namespace %} @@ -171,15 +269,28 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, %ignore sentencepiece::SentencePieceText; %ignore sentencepiece::NormalizerSpec; %ignore sentencepiece::TrainerSpec; - %ignore sentencepiece::SentencePieceProcessor::status; + %ignore sentencepiece::SentencePieceProcessor::Encode; +%ignore sentencepiece::SentencePieceProcessor::EncodeAsPieces; +%ignore sentencepiece::SentencePieceProcessor::EncodeAsIds; +%ignore sentencepiece::SentencePieceProcessor::EncodeAsSerializedProto; %ignore sentencepiece::SentencePieceProcessor::SampleEncode; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsIds; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsPieces; +%ignore sentencepiece::SentencePieceProcessor::SampleEncodeAsSerializedProto; %ignore sentencepiece::SentencePieceProcessor::NBestEncode; +%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsPieces; +%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsIds; +%ignore sentencepiece::SentencePieceProcessor::NBestEncodeAsSerializedProto; %ignore sentencepiece::SentencePieceProcessor::SampleEncodeAndScore; + %ignore sentencepiece::SentencePieceProcessor::Decode; %ignore sentencepiece::SentencePieceProcessor::DecodeIds; +%ignore sentencepiece::SentencePieceProcessor::DecodePieces; +%ignore sentencepiece::SentencePieceProcessor::DecodePiecesAsSerializedProto; %ignore sentencepiece::SentencePieceProcessor::DecodeIdsAsSerializedProto; + %ignore sentencepiece::SentencePieceProcessor::model_proto; %ignore sentencepiece::SentencePieceProcessor::Load; %ignore sentencepiece::SentencePieceProcessor::LoadOrDie; @@ -200,62 +311,131 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, return $self->Load(arg); } - std::string DecodeIdsWithCheck( - const std::vector &ids) const { - const int num_pieces = $self->GetPieceSize(); - for (int id : ids) { - if (id < 0 || id >= num_pieces) { - throw sentencepiece::util::Status( - sentencepiece::util::StatusCode::kOutOfRange, - "piece id is out of range."); - } - } - return $self->DecodeIds(ids); - } - - util::bytes DecodeIdsAsSerializedProtoWithCheck( - const std::vector &ids) const { - const int num_pieces = $self->GetPieceSize(); - for (int id : ids) { - if (id < 0 || id >= num_pieces) { - throw sentencepiece::util::Status( - sentencepiece::util::StatusCode::kOutOfRange, - "piece id is out of range."); - } - } - return $self->DecodeIdsAsSerializedProto(ids); - } - + ///////////////////////////////////////////////////////////////////////////// + // EncodeAs* (Single request) std::vector _EncodeAsIds(absl::string_view text, - bool enabele_sampling, + bool enable_sampling, int nbest_size, float alpha, - bool add_bos, bool add_eos, bool reverse) { - auto ids = enabele_sampling ? + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + auto ids = enable_sampling ? $self->SampleEncodeAsIds(text, nbest_size, alpha) : $self->EncodeAsIds(text); - RewriteIds(*$self, &ids, add_bos, add_eos, reverse); + RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece); return ids; } std::vector _EncodeAsPieces(absl::string_view text, - bool enabele_sampling, + bool enable_sampling, int nbest_size, float alpha, bool add_bos, bool add_eos, bool reverse, - bool emit_unk_piece) { - auto pieces = enabele_sampling ? + bool emit_unk_piece) const { + auto pieces = enable_sampling ? $self->SampleEncodeAsPieces(text, nbest_size, alpha) : $self->EncodeAsPieces(text); - RewritePieces(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); return pieces; } + sentencepiece::util::bytes _EncodeAsSerializedProto(absl::string_view text, + bool enable_sampling, + int nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + auto proto = enable_sampling ? + $self->SampleEncodeAsSerializedProto(text, nbest_size, alpha) : + $self->EncodeAsSerializedProto(text); + RewriteIds(*$self, &proto, add_bos, add_eos, reverse, emit_unk_piece); + return proto; + } + + ///////////////////////////////////////////////////////////////////////////// + // EncodeAs* (Batch request) + std::vector> _EncodeAsIdsBatch( + const std::vector &ins, int num_threads, + bool enable_sampling, int nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds, + absl::string_view, std::vector); + } + + std::vector> _EncodeAsPiecesBatch( + const std::vector &ins, int num_threads, + bool enable_sampling, int nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsPieces, + absl::string_view, std::vector); + } + + BytesArray _EncodeAsSerializedProtoBatch( + const std::vector &ins, int num_threads, + bool enable_sampling, int nbest_size, float alpha, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsSerializedProto, + absl::string_view, + sentencepiece::util::bytes); + } + + ///////////////////////////////////////////////////////////////////////////// + // DecodeAs* (Single request) + std::string _DecodeIds(const std::vector &ids) const { + CheckIds(ids, $self->GetPieceSize()); + return $self->DecodeIds(ids); + } + + std::string _DecodePieces(const std::vector &pieces) const { + return $self->DecodePieces(pieces); + } + + sentencepiece::util::bytes _DecodeIdsAsSerializedProto( + const std::vector &ids) const { + CheckIds(ids, $self->GetPieceSize()); + return $self->DecodeIdsAsSerializedProto(ids); + } + + sentencepiece::util::bytes _DecodePiecesAsSerializedProto( + const std::vector &pieces) const { + CheckIds(pieces, $self->GetPieceSize()); + return $self->DecodePiecesAsSerializedProto(pieces); + } + + ///////////////////////////////////////////////////////////////////////////// + // DecodeAs* (Batch request) + std::vector _DecodeIdsBatch( + const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string); + } + + BytesArray _DecodeIdsAsSerializedProtoBatch( + const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int, + sentencepiece::util::bytes); + } + + std::vector _DecodePiecesBatch( + const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); + } + + BytesArray _DecodePiecesAsSerializedProtoBatch( + const std::vector> &ins, int num_threads) const { + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, + sentencepiece::util::bytes); + } + + //////////////////////////////////////////////////////////////////////////// + // NBestEncodeAs* (Single request) std::vector> _NBestEncodeAsIds(absl::string_view text, int nbest_size, - bool add_bos, bool add_eos, bool reverse) { + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { auto idss = $self->NBestEncodeAsIds(text, nbest_size); for (auto &ids : idss) { - RewriteIds(*$self, &ids, add_bos, add_eos, reverse); + RewriteIds(*$self, &ids, add_bos, add_eos, reverse, emit_unk_piece); } return idss; } @@ -264,40 +444,74 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, _NBestEncodeAsPieces(absl::string_view text, int nbest_size, bool add_bos, bool add_eos, bool reverse, - bool emit_unk_piece) { + bool emit_unk_piece) const { auto piecess = $self->NBestEncodeAsPieces(text, nbest_size); for (auto &pieces : piecess) { - RewritePieces(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + RewriteIds(*$self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); } return piecess; } + sentencepiece::util::bytes _NBestEncodeAsSerializedProto(absl::string_view text, + int nbest_size, + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { + RewriteIds(*$self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return $self->NBestEncodeAsSerializedProto(text, nbest_size); + } + + ///////////////////////////////////////////////////////////////////////////// + // SampleEncodeAndScoreAs* (Single request) std::vector, float>> _SampleEncodeAndScoreAsIds(absl::string_view text, int num_samples, float theta, bool wor, bool include_best, - bool add_bos, bool add_eos, bool reverse) { + bool add_bos, bool add_eos, bool reverse, + bool emit_unk_piece) const { auto idss = $self->SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best); for (auto &ids : idss) { - RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse); + RewriteIds(*$self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); } return idss; } - std::vector, float>> + std::vector, float>> _SampleEncodeAndScoreAsPieces(absl::string_view text, int num_samples, float theta, bool wor, bool include_best, bool add_bos, bool add_eos, bool reverse, - bool emit_unk_piece) { + bool emit_unk_piece) const { auto piecess = $self->SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best); for (auto &pieces : piecess) { - RewritePieces(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); + RewriteIds(*$self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); } return piecess; - } + } + + // Calculate Entropy + float _CalculateEntropy(absl::string_view text, float theta) { + return $self->CalculateEntropy(text, theta); + } + + std::vector _CalculateEntropyBatch(const std::vector &ins, + float theta, int num_threads) { + std::vector outs(ins.size()); + InitNumThreads(ins, &num_threads); + { + ThreadPool pool(ins.size()); + for (int n = 0; n < num_threads; ++n) { + pool.Schedule([&, n]() { + for (size_t i = n; i < ins.size(); i += num_threads) { + outs[i] = self->CalculateEntropy(ins[i], theta); + } + }); + } + } + return outs; + } %pythoncode { def Init(self, @@ -310,7 +524,8 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, emit_unk_piece=False, enable_sampling=False, nbest_size=-1, - alpha=0.1): + alpha=0.1, + num_threads=1): """Initialzie sentencepieceProcessor. Args: @@ -330,6 +545,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, forward-filtering-and-backward-sampling algorithm. alpha: Soothing parameter for unigram sampling, and dropout probability of merge operations for BPE-dropout. + num_threads: number of threads in batch processing. """ _sentencepiece_processor_init_native(self) @@ -341,6 +557,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, self._enable_sampling = enable_sampling self._nbest_size = nbest_size self._alpha = alpha + self._num_threads = num_threads if model_file or model_proto: self.Load(model_file=model_file, model_proto=model_proto) @@ -354,7 +571,8 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, emit_unk_piece=None, enable_sampling=None, nbest_size=None, - alpha=None): + alpha=None, + num_threads=None): """Encode text input to segmented ids or tokens. Args: @@ -373,6 +591,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, forward-filtering-and-backward-sampling algorithm. alpha: Soothing parameter for unigram sampling, and merge probability for BPE-dropout (probablity 'p' in BPE-dropout paper). + num_threads: the number of threads used in the batch processin (Default = 1). """ if out_type is None: @@ -391,6 +610,8 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, nbest_size = self._nbest_size if alpha is None: alpha = self._alpha + if num_threads is None: + num_threads = self._num_threads if enable_sampling == True and (nbest_size is None or nbest_size == 0 or nbest_size == 1 or alpha is None): @@ -401,18 +622,59 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, 'instead of nbest segmentations.' ) - def _encode(text): - if out_type is int: - return self._EncodeAsIds(text, enable_sampling, nbest_size, - alpha, add_bos, add_eos, reverse) - else: - return self._EncodeAsPieces(text, enable_sampling, nbest_size, - alpha, add_bos, add_eos, reverse, emit_unk_piece) + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') if type(input) is list: - return [_encode(n) for n in input] + if out_type is int: + return self._EncodeAsIdsBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type is str: + return self._EncodeAsPiecesBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'proto': + return self._EncodeAsSerializedProtoBatch(input, num_threads, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + + if out_type is int: + return self._EncodeAsIds(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type is str: + return self._EncodeAsPieces(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'proto': + return self._EncodeAsSerializedProto(input, enable_sampling, nbest_size, + alpha, add_bos, add_eos, reverse, emit_unk_piece) + + raise RuntimeError('unknown out_type={}'.format(out_type)) + return None - return _encode(input) + + def EncodeAsPieces(self, input, **kwargs): + return self.Encode(input=input, out_type=str, **kwargs) + + + def EncodeAsIds(self, input, **kwargs): + return self.Encode(input=input, out_type=int, **kwargs) + + + def EncodeAsSerializedProto(self, input, **kwargs): + return self.Encode(input=input, out_type='proto', **kwargs) + + + def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type=str, enable_sampling=True, **kwargs) + + + def SampleEncodeAsIds(self, input, nbest_size=None, alpha=None,**kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type=int, enable_sampling=True, **kwargs) + + + def SampleEncodeAsSerializedProto(self, input, nbest_size=None, alpha=None, **kwargs): + return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha, + out_type='proto', enable_sampling=True, **kwargs) def NBestEncode(self, @@ -453,9 +715,14 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, def _encode(text): if out_type is int: - return self._NBestEncodeAsIds(text, nbest_size, add_bos, add_eos, reverse) - else: - return self._NBestEncodeAsPieces(text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + return self._NBestEncodeAsIds(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) + if out_type is str: + return self._NBestEncodeAsPieces(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) + if out_type == 'proto': + return self._NBestEncodeAsSerializedProto(text, nbest_size, + add_bos, add_eos, reverse, emit_unk_piece) if type(input) is list: return [_encode(n) for n in input] @@ -463,6 +730,21 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, return _encode(input) + def NBestEncodeAsPieces(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type=str, **kwargs) + + + def NBestEncodeAsIds(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type=int, **kwargs) + + + def NBestEncodeAsSerializedProto(self, input, nbest_size=None, **kwargs): + return self.NBestEncode(input=input, nbest_size=nbest_size, + out_type='proto', **kwargs) + + def SampleEncodeAndScore(self, input, out_type=None, @@ -478,7 +760,7 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, Args: input: input string. accepsts list of string. - out_type: output type. int or str. + out_type: output type. int or str or 'proto'. add_bos: Add to the result (Default = false) add_eos: Add to the result (Default = false) / is added after reversing (if enabled). reverse: Reverses the tokenized sequence (Default = false) @@ -513,12 +795,12 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, if include_best and not wor: raise RuntimeError('When include_best is True, We must specify "wor = True".') - + def _encode(text): if out_type is int: return self._SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best, - add_bos, add_eos, reverse) + add_bos, add_eos, reverse, emit_unk_piece) else: return self._SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) @@ -529,35 +811,90 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, return _encode(input) - def Decode(self, input): - """Decode processed id or token sequences.""" + def Decode(self, input, out_type=str, num_threads=None): + """Decode processed id or token sequences. + + Args: + out_type: output type. str or 'proto' (Default = str) + num_threads: the number of threads used in the batch processin (Default = 1). + """ + + if num_threads is None: + num_threads = self._num_threads + + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') if not input: - return self.DecodeIds([]) - elif type(input) is int: - return self.DecodeIdsWithCheck([input]) - elif type(input) is str: - return self.DecodePieces([input]) + return '' + + if out_type is str: + if type(input) is int: + return self._DecodeIds([input]) + if type(input) is str: + return self._DecodePieces([input]) + + if type(input) is list: + if len(input) == 0 or type(input[0]) is int: + return self._DecodeIds(input) + if type(input[0]) is str: + return self._DecodePieces(input) + + if type(input[0]) is list: + if len(input[0]) == 0 or type(input[0][0]) is int: + return self._DecodeIdsBatch(input, num_threads) + if type(input[0][0]) is str: + return self._DecodePiecesBatch(input, num_threads) + + if out_type == 'proto': + if type(input) is int: + return self._DecodeIdsAsSerializedProto([input]) + if type(input) is str: + return self._DecodePiecesAsSerializedProto([input]) + + if type(input) is list: + if len(input) == 0 or type(input[0]) is int: + return self._DecodeIdsAsSerializedProto(input) + if type(input[0]) is str: + return self._DecodePiecesAsSerializedProto(input) + + if type(input[0]) is list: + if len(input[0]) == 0 or type(input[0][0]) is int: + return self._DecodeIdsAsSerializedProtoBatch(input, num_threads) + if type(input[0][0]) is str: + return self._DecodePiecesAsSerializedProtoBatch(input, num_threads) + + + raise RuntimeError('unknown output or input type') + return None - def _decode(input): - if not input: - return self.DecodeIds([]) - if type(input[0]) is int: - return self.DecodeIdsWithCheck(input) - return self.DecodePieces(input) - if type(input[0]) is list: - return [_decode(n) for n in input] + def DecodePieces(self, input, out_type=str, **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) - return _decode(input) + def DecodeIds(self, input, out_type=str, **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def DecodePiecesAsSerializedProto(self, input, out_type='proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) - def Entropy(self, input, theta): - """Calculate sentence entropy""" + def DecodeIdsAsSerializedProto(self, input, out_type='proto', **kwargs): + return self.Decode(input=input, out_type=out_type, **kwargs) + + + def CalculateEntropy(self, input, theta, num_threads=None): + """Calculate sentence entropy""" if type(input) is list: - return [self.CalculateEntropy(n, theta) for n in input] - return self.CalculateEntropy(input, theta) + if num_threads is None: + num_threads = self._num_threads + if num_threads is None or type(num_threads) is not int: + raise RuntimeError('num_threads must be int') + return self._CalculateEntropyBatch(input, theta, num_threads) + + return self._CalculateEntropy(input, theta) def piece_size(self): @@ -696,6 +1033,13 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, } } +%typemap(out) std::vector { + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { + PyList_SetItem($result, i, PyFloat_FromDouble(static_cast($1[i]))); + } +} + %typemap(out) std::vector> { $result = PyList_New($1.size()); for (size_t i = 0; i < $1.size(); ++i) { @@ -715,6 +1059,13 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, } } +%typemap(out) BytesArray { + $result = PyList_New($1.size()); + for (size_t i = 0; i < $1.size(); ++i) { + PyList_SetItem($result, i, MakePyOutputBytes($1[i])); + } +} + %typemap(out) std::vector> { PyObject *input_type = resultobj; $result = PyList_New($1.size()); @@ -778,7 +1129,51 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem($input, i)); if (ustring.IsAvalable()) { - (*out)[i] = std::string(ustring.data(), ustring.size()); + (*out)[i].assign(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + $1 = out; +} + +%typemap(in) const std::vector& { + std::vector *out = nullptr; + if (PyList_Check($input)) { + const size_t size = PyList_Size($input); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem($input, i)); + if (ustring.IsAvalable()) { + (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + $1 = out; +} + +%typemap(in) const std::vector& { + std::vector *out = nullptr; + if (PyList_Check($input)) { + const size_t size = PyList_Size($input); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem($input, i)); + if (ustring.IsAvalable()) { + (*out)[i] = absl::string_view(ustring.data(), ustring.size()); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -813,6 +1208,69 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, $1 = out; } +%typemap(in) const std::vector>& { + std::vector> *out = nullptr; + if (PyList_Check($input)) { + const size_t size = PyList_Size($input); + out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem($input, i); + if (PyList_Check(o)) { + const size_t size2 = PyList_Size(o); + (*out)[i].resize(size2); + for (size_t j = 0; j < size2; ++j) { + const PyInputString ustring(PyList_GetItem(o, j)); + if (ustring.IsAvalable()) { + (*out)[i][j].assign(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + $1 = out; +} + +%typemap(in) const std::vector>& { + std::vector> *out = nullptr; + if (PyList_Check($input)) { + const size_t size = PyList_Size($input); + out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem($input, i); + if (PyList_Check(o)) { + const size_t size2 = PyList_Size(o); + (*out)[i].resize(size2); + for (size_t j = 0; j < size2; ++j) { + PyObject *o2 = PyList_GetItem(o, j); + if (PyInt_Check(o2)) { + (*out)[i][j] = static_cast(PyInt_AsLong(o2)); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + $1 = out; +} + %typemap(in) const std::unordered_map & { std::unordered_map *out = nullptr; if (PyDict_Check($input)) { @@ -880,6 +1338,10 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, delete $1; } +%typemap(freearg) const std::vector& { + delete $1; +} + %typemap(freearg) const std::vector>& { delete $1; } @@ -888,6 +1350,10 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, delete $1; } +%typemap(freearg) const std::vector& { + delete $1; +} + %typemap(freearg) const std::vector>& { delete $1; } @@ -948,8 +1414,6 @@ setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init) SentencePieceProcessor.Tokenize = SentencePieceProcessor.Encode SentencePieceProcessor.Detokenize = SentencePieceProcessor.Decode -SentencePieceProcessor.DecodeIds = SentencePieceProcessor.DecodeIdsWithCheck -SentencePieceProcessor.DecodeIdsAsSerializedProto = SentencePieceProcessor.DecodeIdsAsSerializedProtoWithCheck for m in [ 'PieceToId', 'IdToPiece', 'GetScore', 'IsUnknown', 'IsControl', 'IsUnused', diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx index 36b3a0e..6df3880 100644 --- a/python/src/sentencepiece/sentencepiece_wrap.cxx +++ b/python/src/sentencepiece/sentencepiece_wrap.cxx @@ -2698,10 +2698,13 @@ SWIGINTERN PyObject *SWIG_PyStaticMethod_New(PyObject *SWIGUNUSEDPARM(self), PyO #define SWIGTYPE_p_sentencepiece__SentencePieceTrainer swig_types[3] #define SWIGTYPE_p_std__string swig_types[4] #define SWIGTYPE_p_std__unordered_mapT_std__string_std__string_t swig_types[5] -#define SWIGTYPE_p_std__vectorT_int_t swig_types[6] -#define SWIGTYPE_p_std__vectorT_std__string_t swig_types[7] -static swig_type_info *swig_types[9]; -static swig_module_info swig_module = {swig_types, 8, 0, 0, 0, 0}; +#define SWIGTYPE_p_std__vectorT_absl__string_view_t swig_types[6] +#define SWIGTYPE_p_std__vectorT_int_t swig_types[7] +#define SWIGTYPE_p_std__vectorT_std__string_t swig_types[8] +#define SWIGTYPE_p_std__vectorT_std__vectorT_int_t_t swig_types[9] +#define SWIGTYPE_p_std__vectorT_std__vectorT_std__string_t_t swig_types[10] +static swig_type_info *swig_types[12]; +static swig_module_info swig_module = {swig_types, 11, 0, 0, 0, 0}; #define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) #define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) @@ -2805,9 +2808,13 @@ namespace swig { } +#include #include +#include #include #include +#include +#include #include #include @@ -2815,6 +2822,8 @@ namespace { PyObject* kUnicodeInput = reinterpret_cast(0x1); PyObject* kByteInput = reinterpret_cast(0x2); +using BytesArray = std::vector; + inline void ReleaseResultObject(PyObject *obj) { if (obj != nullptr && obj != kUnicodeInput && obj != kByteInput) { Py_XDECREF(obj); @@ -2857,7 +2866,7 @@ PyObject* MakePyOutputString(const std::string& output, return PyBytes_FromStringAndSize(output.data(), output.size()); } -PyObject* MakePyOutputBytes(const std::string& output) { +PyObject* MakePyOutputBytes(const sentencepiece::util::bytes& output) { return PyBytes_FromStringAndSize(output.data(), output.size()); } @@ -2929,18 +2938,18 @@ class PySentenceIterator : public sentencepiece::SentenceIterator { sentencepiece::util::Status status_; }; -void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, - std::vector *ids, - bool add_bos, bool add_eos, bool reverse) { +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + std::vector *ids, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { if (!add_bos && !add_eos && !reverse) return; if (reverse) std::reverse(ids->begin(), ids->end()); if (add_bos) ids->insert(ids->begin(), sp.bos_id()); if (add_eos) ids->push_back(sp.eos_id()); } -void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, - std::vector *pieces, - bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + std::vector *pieces, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { if (!add_bos && !add_eos && !reverse && !emit_unk_piece) return; if (reverse) std::reverse(pieces->begin(), pieces->end()); if (add_bos) pieces->insert(pieces->begin(), sp.IdToPiece(sp.bos_id())); @@ -2955,6 +2964,98 @@ void RewritePieces(const sentencepiece::SentencePieceProcessor &sp, } } } + +inline void RewriteIds(const sentencepiece::SentencePieceProcessor &sp, + sentencepiece::util::bytes *proto, + bool add_bos, bool add_eos, bool reverse, bool emit_unk_piece) { + if (add_bos || add_eos || reverse || emit_unk_piece) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kUnimplemented, + "add_bos, add_eos, reverse, and emit_unk_piece is not supported in AsSerialize API"); + } +} + +inline void CheckIds(const std::vector &ids, int num_pieces) { + for (int id : ids) { + if (id < 0 || id >= num_pieces) { + throw sentencepiece::util::Status( + sentencepiece::util::StatusCode::kOutOfRange, + "piece id is out of range."); + } + } +} + +inline void CheckIds(const std::vector &ids, int num_pieces) {} + +class ThreadPool { + public: + explicit ThreadPool(size_t request_size) : + request_size_(request_size) {} + + virtual ~ThreadPool() { + for (auto &task : tasks_) { + task.join(); + } + } + + void Schedule(std::function closure) { + static constexpr size_t kMinThreadSize = 2; + if (request_size_ < kMinThreadSize) { + closure(); + } else { + tasks_.emplace_back(closure); + } + } + + private: + size_t request_size_ = 0; + std::vector tasks_; +}; + +template +inline void InitNumThreads(const std::vector &ins, int *num_threads) { + *num_threads = std::max(1, + std::min({*num_threads, + static_cast(ins.size()), 256})); +} + +#define DEFINE_ENCODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ + InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ + pool.Schedule([&, n]() { \ + for (size_t i = n; i < ins.size(); i += num_threads) { \ + auto out = enable_sampling ? \ + self->Sample##FuncName(ins[i], \ + nbest_size, alpha) : \ + self->FuncName(ins[i]); \ + RewriteIds(*self, &out, add_bos, add_eos, reverse, \ + emit_unk_piece); \ + outs[i] = std::move(out); \ + } \ + }); \ + } \ + } \ + return outs; + +#define DEFINE_DECODE_BATCH_FUNC_IMPL(FuncName, InType, OutType) \ + std::vector outs(ins.size()); \ + InitNumThreads(ins, &num_threads); \ + { \ + ThreadPool pool(ins.size()); \ + for (int n = 0; n < num_threads; ++n) { \ + pool.Schedule([&, n]() { \ + for (size_t i = n; i < ins.size(); i += num_threads) { \ + CheckIds(ins[i], self->GetPieceSize()); \ + outs[i] = self->FuncName(ins[i]); \ + } \ + }); \ + } \ + } \ + return outs; + } // namespace @@ -3334,72 +3435,122 @@ SWIGINTERNINLINE PyObject* SWIGINTERN sentencepiece::util::Status sentencepiece_SentencePieceProcessor_LoadFromFile(sentencepiece::SentencePieceProcessor *self,absl::string_view arg){ return self->Load(arg); } -SWIGINTERN std::string sentencepiece_SentencePieceProcessor_DecodeIdsWithCheck(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ - const int num_pieces = self->GetPieceSize(); - for (int id : ids) { - if (id < 0 || id >= num_pieces) { - throw sentencepiece::util::Status( - sentencepiece::util::StatusCode::kOutOfRange, - "piece id is out of range."); - } - } - return self->DecodeIds(ids); - } -SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ - const int num_pieces = self->GetPieceSize(); - for (int id : ids) { - if (id < 0 || id >= num_pieces) { - throw sentencepiece::util::Status( - sentencepiece::util::StatusCode::kOutOfRange, - "piece id is out of range."); - } - } - return self->DecodeIdsAsSerializedProto(ids); - } -SWIGINTERN std::vector< int > sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor *self,absl::string_view text,bool enabele_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse){ - auto ids = enabele_sampling ? +SWIGINTERN std::vector< int > sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto ids = enable_sampling ? self->SampleEncodeAsIds(text, nbest_size, alpha) : self->EncodeAsIds(text); - RewriteIds(*self, &ids, add_bos, add_eos, reverse); + RewriteIds(*self, &ids, add_bos, add_eos, reverse, emit_unk_piece); return ids; } -SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__EncodeAsPieces(sentencepiece::SentencePieceProcessor *self,absl::string_view text,bool enabele_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ - auto pieces = enabele_sampling ? +SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__EncodeAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto pieces = enable_sampling ? self->SampleEncodeAsPieces(text, nbest_size, alpha) : self->EncodeAsPieces(text); - RewritePieces(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + RewriteIds(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); return pieces; } -SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse){ +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__EncodeAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + auto proto = enable_sampling ? + self->SampleEncodeAsSerializedProto(text, nbest_size, alpha) : + self->EncodeAsSerializedProto(text); + RewriteIds(*self, &proto, add_bos, add_eos, reverse, emit_unk_piece); + return proto; + } +SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsIds, + absl::string_view, std::vector); + } +SWIGINTERN std::vector< std::vector< std::string > > sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsPieces, + absl::string_view, std::vector); + } +SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< absl::string_view > const &ins,int num_threads,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + DEFINE_ENCODE_BATCH_FUNC_IMPL(EncodeAsSerializedProto, + absl::string_view, + sentencepiece::util::bytes); + } +SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodeIds(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ + CheckIds(ids, self->GetPieceSize()); + return self->DecodeIds(ids); + } +SWIGINTERN std::string sentencepiece_SentencePieceProcessor__DecodePieces(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ + return self->DecodePieces(pieces); + } +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< int > const &ids){ + CheckIds(ids, self->GetPieceSize()); + return self->DecodeIdsAsSerializedProto(ids); + } +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,std::vector< std::string > const &pieces){ + CheckIds(pieces, self->GetPieceSize()); + return self->DecodePiecesAsSerializedProto(pieces); + } +SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodeIdsBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIds, int, std::string); + } +SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< int > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodeIdsAsSerializedProto, int, + sentencepiece::util::bytes); + } +SWIGINTERN std::vector< std::string > sentencepiece_SentencePieceProcessor__DecodePiecesBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePieces, std::string, std::string); + } +SWIGINTERN BytesArray sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(sentencepiece::SentencePieceProcessor const *self,std::vector< std::vector< std::string > > const &ins,int num_threads){ + DEFINE_DECODE_BATCH_FUNC_IMPL(DecodePiecesAsSerializedProto, std::string, + sentencepiece::util::bytes); + } +SWIGINTERN std::vector< std::vector< int > > sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto idss = self->NBestEncodeAsIds(text, nbest_size); for (auto &ids : idss) { - RewriteIds(*self, &ids, add_bos, add_eos, reverse); + RewriteIds(*self, &ids, add_bos, add_eos, reverse, emit_unk_piece); } return idss; } -SWIGINTERN std::vector< std::vector< std::string > > sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ +SWIGINTERN std::vector< std::vector< std::string > > sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto piecess = self->NBestEncodeAsPieces(text, nbest_size); for (auto &pieces : piecess) { - RewritePieces(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); + RewriteIds(*self, &pieces, add_bos, add_eos, reverse, emit_unk_piece); } return piecess; } -SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse){ +SWIGINTERN sentencepiece::util::bytes sentencepiece_SentencePieceProcessor__NBestEncodeAsSerializedProto(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int nbest_size,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ + RewriteIds(*self, static_cast(nullptr), + add_bos, add_eos, reverse, emit_unk_piece); + return self->NBestEncodeAsSerializedProto(text, nbest_size); + } +SWIGINTERN std::vector< std::pair< std::vector< int >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto idss = self->SampleEncodeAndScoreAsIds(text, num_samples, theta, wor, include_best); for (auto &ids : idss) { - RewriteIds(*self, &ids.first, add_bos, add_eos, reverse); + RewriteIds(*self, &ids.first, add_bos, add_eos, reverse, emit_unk_piece); } return idss; } -SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ +SWIGINTERN std::vector< std::pair< std::vector< std::string >,float > > sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,int num_samples,float theta,bool wor,bool include_best,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){ auto piecess = self->SampleEncodeAndScoreAsPieces(text, num_samples, theta, wor, include_best); for (auto &pieces : piecess) { - RewritePieces(*self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); + RewriteIds(*self, &pieces.first, add_bos, add_eos, reverse, emit_unk_piece); } return piecess; } +SWIGINTERN float sentencepiece_SentencePieceProcessor__CalculateEntropy(sentencepiece::SentencePieceProcessor *self,absl::string_view text,float theta){ + return self->CalculateEntropy(text, theta); + } +SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(sentencepiece::SentencePieceProcessor *self,std::vector< absl::string_view > const &ins,float theta,int num_threads){ + std::vector outs(ins.size()); + InitNumThreads(ins, &num_threads); + { + ThreadPool pool(ins.size()); + for (int n = 0; n < num_threads; ++n) { + pool.Schedule([&, n]() { + for (size_t i = n; i < ins.size(); i += num_threads) { + outs[i] = self->CalculateEntropy(ins[i], theta); + } + }); + } + } + return outs; + } SWIGINTERN int SWIG_AsVal_unsigned_SS_long (PyObject *obj, unsigned long *val) @@ -3703,7 +3854,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetVocabulary(PyObject *SWIGUN for (size_t i = 0; i < size; ++i) { const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); if (ustring.IsAvalable()) { - (*out)[i] = std::string(ustring.data(), ustring.size()); + (*out)[i].assign(ustring.data(), ustring.size()); } else { PyErr_SetString(PyExc_TypeError, "list must contain strings"); SWIG_fail; @@ -3832,19 +3983,31 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; + int arg3 ; + float arg4 ; + bool arg5 ; + bool arg6 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[2] ; - std::vector< std::string > result; + int val3 ; + int ecode3 = 0 ; + float val4 ; + int ecode4 = 0 ; + bool val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + PyObject *swig_obj[6] ; + std::vector< std::pair< std::vector< std::string >,float > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_EncodeAsPieces", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", 6, 6, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_EncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -3856,9 +4019,29 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsPieces(PyObject *SWIGU resultobj = ustring.input_type(); arg2 = absl::string_view(ustring.data(), ustring.size()); } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "4"" of type '" "float""'"); + } + arg4 = static_cast< float >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "5"" of type '" "bool""'"); + } + arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->EncodeAsPieces(arg2); + result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsPieces(arg2,arg3,arg4,arg5,arg6); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -3869,7 +4052,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsPieces(PyObject *SWIGU PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + PyObject *obj = PyList_New(result[i].first.size()); + for (size_t j = 0; j < result[i].first.size(); ++j) { + PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); + } + PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); } } return resultobj; @@ -3878,19 +4065,31 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; + int arg3 ; + float arg4 ; + bool arg5 ; + bool arg6 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[2] ; - std::vector< int > result; + int val3 ; + int ecode3 = 0 ; + float val4 ; + int ecode4 = 0 ; + bool val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + PyObject *swig_obj[6] ; + std::vector< std::pair< std::vector< int >,float > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_EncodeAsIds", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsIds", 6, 6, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_EncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -3902,9 +4101,29 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsIds(PyObject *SWIGUNUS resultobj = ustring.input_type(); arg2 = absl::string_view(ustring.data(), ustring.size()); } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "4"" of type '" "float""'"); + } + arg4 = static_cast< float >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "5"" of type '" "bool""'"); + } + arg5 = static_cast< bool >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->EncodeAsIds(arg2); + result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsIds(arg2,arg3,arg4,arg5,arg6); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -3914,7 +4133,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsIds(PyObject *SWIGUNUS { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); + PyObject *obj = PyList_New(result[i].first.size()); + for (size_t j = 0; j < result[i].first.size(); ++j) { + PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); + } + PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); } } return resultobj; @@ -3923,22 +4146,22 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; - int arg3 ; + float arg3 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; + float val3 ; int ecode3 = 0 ; PyObject *swig_obj[3] ; - std::vector< std::vector< std::string > > result; + float result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_NBestEncodeAsPieces", 3, 3, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_NBestEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -3950,113 +4173,71 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsPieces(PyObject * resultobj = ustring.input_type(); arg2 = absl::string_view(ustring.data(), ustring.size()); } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_NBestEncodeAsPieces" "', argument " "3"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); } - arg3 = static_cast< int >(val3); + arg3 = static_cast< float >(val3); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->NBestEncodeAsPieces(arg2,arg3); + result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - PyObject *input_type = resultobj; - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].size()); - for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); - } - PyList_SetItem(resultobj, i, obj); - } - } + resultobj = SWIG_From_float(static_cast< float >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - PyObject *swig_obj[3] ; - std::vector< std::vector< int > > result; + PyObject *swig_obj[1] ; + int result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_NBestEncodeAsIds", 3, 3, swig_obj)) SWIG_fail; + if (!args) SWIG_fail; + swig_obj[0] = args; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_NBestEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetPieceSize" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_NBestEncodeAsIds" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->NBestEncodeAsIds(arg2,arg3); + result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->GetPieceSize(); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].size()); - for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); - } - PyList_SetItem(resultobj, i, obj); - } - } + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_PieceToId(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; - int arg3 ; - float arg4 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - float val4 ; - int ecode4 = 0 ; - PyObject *swig_obj[4] ; - std::vector< std::string > result; + PyObject *swig_obj[2] ; + int result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAsPieces", 4, 4, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_PieceToId", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_PieceToId" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -4068,81 +4249,47 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsPieces(PyObject resultobj = ustring.input_type(); arg2 = absl::string_view(ustring.data(), ustring.size()); } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAsPieces" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); - ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAsPieces" "', argument " "4"" of type '" "float""'"); - } - arg4 = static_cast< float >(val4); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAsPieces(arg2,arg3,arg4); + result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->PieceToId(arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - PyObject *input_type = resultobj; - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); - } - } + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IdToPiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; - float arg4 ; + int arg2 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - float val4 ; - int ecode4 = 0 ; - PyObject *swig_obj[4] ; - std::vector< int > result; + int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + std::string *result = 0 ; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAsIds", 4, 4, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IdToPiece", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAsIds" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); - ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAsIds" "', argument " "4"" of type '" "float""'"); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "2"" of type '" "int""'"); } - arg4 = static_cast< float >(val4); + arg2 = static_cast< int >(val2); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAsIds(arg2,arg3,arg4); + result = (std::string *) &((sentencepiece::SentencePieceProcessor const *)arg1)->IdToPiece(arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -4150,10 +4297,8 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsIds(PyObject *SW } } { - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); - } + PyObject *input_type = resultobj; + resultobj = MakePyOutputString(*result, input_type); } return resultobj; fail: @@ -4161,489 +4306,290 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetScore(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; - float arg4 ; - bool arg5 ; - bool arg6 ; + int arg2 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - float val4 ; - int ecode4 = 0 ; - bool val5 ; - int ecode5 = 0 ; - bool val6 ; - int ecode6 = 0 ; - PyObject *swig_obj[6] ; - std::vector< std::pair< std::vector< std::string >,float > > result; + int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + float result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", 6, 6, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_GetScore", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetScore" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); - ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "4"" of type '" "float""'"); - } - arg4 = static_cast< float >(val4); - ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "5"" of type '" "bool""'"); - } - arg5 = static_cast< bool >(val5); - ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsPieces" "', argument " "6"" of type '" "bool""'"); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_GetScore" "', argument " "2"" of type '" "int""'"); } - arg6 = static_cast< bool >(val6); + arg2 = static_cast< int >(val2); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsPieces(arg2,arg3,arg4,arg5,arg6); + result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->GetScore(arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - PyObject *input_type = resultobj; - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].first.size()); - for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString(result[i].first[j], input_type)); - } - PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); - } - } + resultobj = SWIG_From_float(static_cast< float >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnknown(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; - float arg4 ; - bool arg5 ; - bool arg6 ; + int arg2 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - float val4 ; - int ecode4 = 0 ; - bool val5 ; - int ecode5 = 0 ; - bool val6 ; - int ecode6 = 0 ; - PyObject *swig_obj[6] ; - std::vector< std::pair< std::vector< int >,float > > result; + int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + bool result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAndScoreAsIds", 6, 6, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnknown", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); - ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "4"" of type '" "float""'"); - } - arg4 = static_cast< float >(val4); - ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "5"" of type '" "bool""'"); - } - arg5 = static_cast< bool >(val5); - ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor_SampleEncodeAndScoreAsIds" "', argument " "6"" of type '" "bool""'"); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "2"" of type '" "int""'"); } - arg6 = static_cast< bool >(val6); + arg2 = static_cast< int >(val2); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAndScoreAsIds(arg2,arg3,arg4,arg5,arg6); + result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnknown(arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].first.size()); - for (size_t j = 0; j < result[i].first.size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i].first[j]))); - } - PyList_SetItem(resultobj, i, PyTuple_Pack(2, obj, PyFloat_FromDouble(static_cast(result[i].second)))); - } - } + resultobj = SWIG_From_bool(static_cast< bool >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodePieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsControl(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< std::string > *arg2 = 0 ; + int arg2 ; void *argp1 = 0 ; int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; PyObject *swig_obj[2] ; - std::string result; + bool result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodePieces", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsControl", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodePieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsControl" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - std::vector *out = nullptr; - if (PyList_Check(swig_obj[1])) { - const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector(size); - for (size_t i = 0; i < size; ++i) { - const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); - if (ustring.IsAvalable()) { - (*out)[i] = std::string(ustring.data(), ustring.size()); - } else { - PyErr_SetString(PyExc_TypeError, "list must contain strings"); - SWIG_fail; - } - resultobj = ustring.input_type(); - } - } else { - PyErr_SetString(PyExc_TypeError, "not a list"); - SWIG_fail; - } - arg2 = out; - } + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsControl" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->DecodePieces((std::vector< std::string > const &)*arg2); + result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsControl(arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - PyObject *input_type = resultobj; - resultobj = MakePyOutputString(result, input_type); - } - { - delete arg2; - } + resultobj = SWIG_From_bool(static_cast< bool >(result)); return resultobj; fail: - { - delete arg2; - } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnused(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - float arg3 ; + int arg2 ; void *argp1 = 0 ; int res1 = 0 ; - float val3 ; - int ecode3 = 0 ; - PyObject *swig_obj[3] ; - float result; + int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + bool result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnused", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } - ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_CalculateEntropy" "', argument " "3"" of type '" "float""'"); + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "2"" of type '" "int""'"); } - arg3 = static_cast< float >(val3); + arg2 = static_cast< int >(val2); { try { - result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->CalculateEntropy(arg2,arg3); + result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnused(arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_float(static_cast< float >(result)); + resultobj = SWIG_From_bool(static_cast< bool >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_EncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsByte(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; + int arg2 ; void *argp1 = 0 ; int res1 = 0 ; + int val2 ; + int ecode2 = 0 ; PyObject *swig_obj[2] ; - sentencepiece::util::bytes result; + bool result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_EncodeAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsByte", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_EncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsByte" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } + ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsByte" "', argument " "2"" of type '" "int""'"); + } + arg2 = static_cast< int >(val2); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->EncodeAsSerializedProto(arg2); + result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsByte(arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - resultobj = MakePyOutputBytes(result); - } + resultobj = SWIG_From_bool(static_cast< bool >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SampleEncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_unk_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; - float arg4 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - float val4 ; - int ecode4 = 0 ; - PyObject *swig_obj[4] ; - sentencepiece::util::bytes result; + PyObject *swig_obj[1] ; + int result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SampleEncodeAsSerializedProto", 4, 4, swig_obj)) SWIG_fail; + if (!args) SWIG_fail; + swig_obj[0] = args; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SampleEncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_unk_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_SampleEncodeAsSerializedProto" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); - ecode4 = SWIG_AsVal_float(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor_SampleEncodeAsSerializedProto" "', argument " "4"" of type '" "float""'"); - } - arg4 = static_cast< float >(val4); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->SampleEncodeAsSerializedProto(arg2,arg3,arg4); + result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->unk_id(); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - resultobj = MakePyOutputBytes(result); - } + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_NBestEncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_bos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - int arg3 ; void *argp1 = 0 ; int res1 = 0 ; - int val3 ; - int ecode3 = 0 ; - PyObject *swig_obj[3] ; - sentencepiece::util::bytes result; + PyObject *swig_obj[1] ; + int result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_NBestEncodeAsSerializedProto", 3, 3, swig_obj)) SWIG_fail; + if (!args) SWIG_fail; + swig_obj[0] = args; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_NBestEncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_bos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } - ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor_NBestEncodeAsSerializedProto" "', argument " "3"" of type '" "int""'"); - } - arg3 = static_cast< int >(val3); { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->NBestEncodeAsSerializedProto(arg2,arg3); + result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->bos_id(); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - resultobj = MakePyOutputBytes(result); - } + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodePiecesAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_eos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< std::string > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[2] ; - sentencepiece::util::bytes result; + PyObject *swig_obj[1] ; + int result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodePiecesAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; + if (!args) SWIG_fail; + swig_obj[0] = args; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodePiecesAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_eos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - std::vector *out = nullptr; - if (PyList_Check(swig_obj[1])) { - const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector(size); - for (size_t i = 0; i < size; ++i) { - const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); - if (ustring.IsAvalable()) { - (*out)[i] = std::string(ustring.data(), ustring.size()); - } else { - PyErr_SetString(PyExc_TypeError, "list must contain strings"); - SWIG_fail; - } - resultobj = ustring.input_type(); - } - } else { - PyErr_SetString(PyExc_TypeError, "not a list"); - SWIG_fail; - } - arg2 = out; - } { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->DecodePiecesAsSerializedProto((std::vector< std::string > const &)*arg2); + result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->eos_id(); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - { - resultobj = MakePyOutputBytes(result); - } - { - delete arg2; - } + resultobj = SWIG_From_int(static_cast< int >(result)); return resultobj; fail: - { - delete arg2; - } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_pad_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; void *argp1 = 0 ; @@ -4655,12 +4601,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetPieceSize(PyObject *SWIGUNU swig_obj[0] = args; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetPieceSize" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_pad_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { try { - result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->GetPieceSize(); + result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->pad_id(); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -4674,71 +4620,66 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_PieceToId(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_serialized_model_proto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[2] ; - int result; + PyObject *swig_obj[1] ; + sentencepiece::util::bytes result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_PieceToId", 2, 2, swig_obj)) SWIG_fail; + if (!args) SWIG_fail; + swig_obj[0] = args; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_PieceToId" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_serialized_model_proto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); - } { try { - result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->PieceToId(arg2); + result = ((sentencepiece::SentencePieceProcessor const *)arg1)->serialized_model_proto(); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_int(static_cast< int >(result)); + { + resultobj = MakePyOutputBytes(result); + } return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IdToPiece(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - int arg2 ; + absl::string_view arg2 ; void *argp1 = 0 ; int res1 = 0 ; - int val2 ; - int ecode2 = 0 ; PyObject *swig_obj[2] ; - std::string *result = 0 ; + sentencepiece::util::Status result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IdToPiece", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_LoadFromFile", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_LoadFromFile" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IdToPiece" "', argument " "2"" of type '" "int""'"); - } - arg2 = static_cast< int >(val2); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } { try { - result = (std::string *) &((sentencepiece::SentencePieceProcessor const *)arg1)->IdToPiece(arg2); + result = sentencepiece_SentencePieceProcessor_LoadFromFile(arg1,arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -4746,8 +4687,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IdToPiece(PyObject *SWIGUNUSED } } { - PyObject *input_type = resultobj; - resultobj = MakePyOutputString(*result, input_type); + if (!(&result)->ok()) { + SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); + } + resultobj = SWIG_From_bool((&result)->ok()); } return resultobj; fail: @@ -4755,338 +4698,916 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetScore(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - int arg2 ; + absl::string_view arg2 ; + bool arg3 ; + int arg4 ; + float arg5 ; + bool arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; void *argp1 = 0 ; int res1 = 0 ; - int val2 ; - int ecode2 = 0 ; - PyObject *swig_obj[2] ; - float result; + bool val3 ; + int ecode3 = 0 ; + int val4 ; + int ecode4 = 0 ; + float val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + PyObject *swig_obj[9] ; + std::vector< int > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_GetScore", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIds", 9, 9, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetScore" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_GetScore" "', argument " "2"" of type '" "int""'"); - } - arg2 = static_cast< int >(val2); { - try { - result = (float)((sentencepiece::SentencePieceProcessor const *)arg1)->GetScore(arg2); - ReleaseResultObject(resultobj); - } - catch (const sentencepiece::util::Status &status) { - SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; } + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); } - resultobj = SWIG_From_float(static_cast< float >(result)); - return resultobj; -fail: - return NULL; -} - - -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnknown(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "3"" of type '" "bool""'"); + } + arg3 = static_cast< bool >(val3); + ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "4"" of type '" "int""'"); + } + arg4 = static_cast< int >(val4); + ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "5"" of type '" "float""'"); + } + arg5 = static_cast< float >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + { + try { + result = sentencepiece_SentencePieceProcessor__EncodeAsIds((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); + } + } + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - int arg2 ; + absl::string_view arg2 ; + bool arg3 ; + int arg4 ; + float arg5 ; + bool arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; void *argp1 = 0 ; int res1 = 0 ; - int val2 ; - int ecode2 = 0 ; - PyObject *swig_obj[2] ; - bool result; + bool val3 ; + int ecode3 = 0 ; + int val4 ; + int ecode4 = 0 ; + float val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + PyObject *swig_obj[9] ; + std::vector< std::string > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnknown", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPieces", 9, 9, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnknown" "', argument " "2"" of type '" "int""'"); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } + ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "3"" of type '" "bool""'"); } - arg2 = static_cast< int >(val2); + arg3 = static_cast< bool >(val3); + ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "4"" of type '" "int""'"); + } + arg4 = static_cast< int >(val4); + ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "5"" of type '" "float""'"); + } + arg5 = static_cast< float >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); { try { - result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnknown(arg2); + result = sentencepiece_SentencePieceProcessor__EncodeAsPieces((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_bool(static_cast< bool >(result)); + { + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + } + } return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsControl(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - int arg2 ; + absl::string_view arg2 ; + bool arg3 ; + int arg4 ; + float arg5 ; + bool arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; void *argp1 = 0 ; int res1 = 0 ; - int val2 ; - int ecode2 = 0 ; - PyObject *swig_obj[2] ; - bool result; + bool val3 ; + int ecode3 = 0 ; + int val4 ; + int ecode4 = 0 ; + float val5 ; + int ecode5 = 0 ; + bool val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + PyObject *swig_obj[9] ; + sentencepiece::util::bytes result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsControl", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProto", 9, 9, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsControl" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsControl" "', argument " "2"" of type '" "int""'"); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } + ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "3"" of type '" "bool""'"); } - arg2 = static_cast< int >(val2); + arg3 = static_cast< bool >(val3); + ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "4"" of type '" "int""'"); + } + arg4 = static_cast< int >(val4); + ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "5"" of type '" "float""'"); + } + arg5 = static_cast< float >(val5); + ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "6"" of type '" "bool""'"); + } + arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProto" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); { try { - result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsControl(arg2); + result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_bool(static_cast< bool >(result)); + { + resultobj = MakePyOutputBytes(result); + } return resultobj; fail: return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsUnused(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - int arg2 ; + std::vector< absl::string_view > *arg2 = 0 ; + int arg3 ; + bool arg4 ; + int arg5 ; + float arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; + bool arg10 ; void *argp1 = 0 ; int res1 = 0 ; - int val2 ; - int ecode2 = 0 ; - PyObject *swig_obj[2] ; - bool result; + int val3 ; + int ecode3 = 0 ; + bool val4 ; + int ecode4 = 0 ; + int val5 ; + int ecode5 = 0 ; + float val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; + std::vector< std::vector< int > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsUnused", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIdsBatch", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsUnused" "', argument " "2"" of type '" "int""'"); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "3"" of type '" "int""'"); } - arg2 = static_cast< int >(val2); + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "5"" of type '" "int""'"); + } + arg5 = static_cast< int >(val5); + ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "6"" of type '" "float""'"); + } + arg6 = static_cast< float >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsIdsBatch" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); { try { - result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsUnused(arg2); + result = sentencepiece_SentencePieceProcessor__EncodeAsIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_bool(static_cast< bool >(result)); + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { + PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + } + PyList_SetItem(resultobj, i, obj); + } + } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_IsByte(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - int arg2 ; + std::vector< absl::string_view > *arg2 = 0 ; + int arg3 ; + bool arg4 ; + int arg5 ; + float arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; + bool arg10 ; void *argp1 = 0 ; int res1 = 0 ; - int val2 ; - int ecode2 = 0 ; - PyObject *swig_obj[2] ; - bool result; + int val3 ; + int ecode3 = 0 ; + bool val4 ; + int ecode4 = 0 ; + int val5 ; + int ecode5 = 0 ; + float val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; + std::vector< std::vector< std::string > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_IsByte", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPiecesBatch", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_IsByte" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - ecode2 = SWIG_AsVal_int(swig_obj[1], &val2); - if (!SWIG_IsOK(ecode2)) { - SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_IsByte" "', argument " "2"" of type '" "int""'"); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "3"" of type '" "int""'"); } - arg2 = static_cast< int >(val2); + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "5"" of type '" "int""'"); + } + arg5 = static_cast< int >(val5); + ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "6"" of type '" "float""'"); + } + arg6 = static_cast< float >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsPiecesBatch" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); { try { - result = (bool)((sentencepiece::SentencePieceProcessor const *)arg1)->IsByte(arg2); + result = sentencepiece_SentencePieceProcessor__EncodeAsPiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_bool(static_cast< bool >(result)); + { + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { + PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); + } + PyList_SetItem(resultobj, i, obj); + } + } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_unk_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< absl::string_view > *arg2 = 0 ; + int arg3 ; + bool arg4 ; + int arg5 ; + float arg6 ; + bool arg7 ; + bool arg8 ; + bool arg9 ; + bool arg10 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[1] ; - int result; + int val3 ; + int ecode3 = 0 ; + bool val4 ; + int ecode4 = 0 ; + int val5 ; + int ecode5 = 0 ; + float val6 ; + int ecode6 = 0 ; + bool val7 ; + int ecode7 = 0 ; + bool val8 ; + int ecode8 = 0 ; + bool val9 ; + int ecode9 = 0 ; + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; + BytesArray result; - if (!args) SWIG_fail; - swig_obj[0] = args; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsSerializedProtoBatch", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_unk_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "4"" of type '" "bool""'"); + } + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_int(swig_obj[4], &val5); + if (!SWIG_IsOK(ecode5)) { + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "5"" of type '" "int""'"); + } + arg5 = static_cast< int >(val5); + ecode6 = SWIG_AsVal_float(swig_obj[5], &val6); + if (!SWIG_IsOK(ecode6)) { + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "6"" of type '" "float""'"); + } + arg6 = static_cast< float >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); + ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); + if (!SWIG_IsOK(ecode8)) { + SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "8"" of type '" "bool""'"); + } + arg8 = static_cast< bool >(val8); + ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); + if (!SWIG_IsOK(ecode9)) { + SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "9"" of type '" "bool""'"); + } + arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__EncodeAsSerializedProtoBatch" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); { try { - result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->unk_id(); + result = sentencepiece_SentencePieceProcessor__EncodeAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_int(static_cast< int >(result)); + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); + } + } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_bos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< int > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[1] ; - int result; + PyObject *swig_obj[2] ; + std::string result; - if (!args) SWIG_fail; - swig_obj[0] = args; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIds", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_bos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyInt_Check(o)) { + (*out)[i] = static_cast(PyInt_AsLong(o)); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + arg2 = out; + } { try { - result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->bos_id(); + result = sentencepiece_SentencePieceProcessor__DecodeIds((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_int(static_cast< int >(result)); + { + PyObject *input_type = resultobj; + resultobj = MakePyOutputString(result, input_type); + } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_eos_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< std::string > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[1] ; - int result; + PyObject *swig_obj[2] ; + std::string result; - if (!args) SWIG_fail; - swig_obj[0] = args; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePieces", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_eos_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i].assign(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; } - arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { try { - result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->eos_id(); + result = sentencepiece_SentencePieceProcessor__DecodePieces((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_int(static_cast< int >(result)); + { + PyObject *input_type = resultobj; + resultobj = MakePyOutputString(result, input_type); + } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_pad_id(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< int > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[1] ; - int result; + PyObject *swig_obj[2] ; + sentencepiece::util::bytes result; - if (!args) SWIG_fail; - swig_obj[0] = args; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_pad_id" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyInt_Check(o)) { + (*out)[i] = static_cast(PyInt_AsLong(o)); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + arg2 = out; + } { try { - result = (int)((sentencepiece::SentencePieceProcessor const *)arg1)->pad_id(); + result = sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); } } - resultobj = SWIG_From_int(static_cast< int >(result)); + { + resultobj = MakePyOutputBytes(result); + } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_serialized_model_proto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< std::string > *arg2 = 0 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[1] ; + PyObject *swig_obj[2] ; sentencepiece::util::bytes result; - if (!args) SWIG_fail; - swig_obj[0] = args; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsSerializedProto", 2, 2, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_serialized_model_proto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i].assign(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; + } { try { - result = ((sentencepiece::SentencePieceProcessor const *)arg1)->serialized_model_proto(); + result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::string > const &)*arg2); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5096,39 +5617,74 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_serialized_model_proto(PyObjec { resultobj = MakePyOutputBytes(result); } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; + std::vector< std::vector< int > > *arg2 = 0 ; + int arg3 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[2] ; - sentencepiece::util::Status result; + int val3 ; + int ecode3 = 0 ; + PyObject *swig_obj[3] ; + std::vector< std::string > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_LoadFromFile", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsBatch", 3, 3, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_LoadFromFile" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); + std::vector> *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyList_Check(o)) { + const size_t size2 = PyList_Size(o); + (*out)[i].resize(size2); + for (size_t j = 0; j < size2; ++j) { + PyObject *o2 = PyList_GetItem(o, j); + if (PyInt_Check(o2)) { + (*out)[i][j] = static_cast(PyInt_AsLong(o2)); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); SWIG_fail; } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + arg2 = out; } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodeIdsBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); { try { - result = sentencepiece_SentencePieceProcessor_LoadFromFile(arg1,arg2); + result = sentencepiece_SentencePieceProcessor__DecodeIdsBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< int > > const &)*arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5136,43 +5692,63 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadFromFile(PyObject *SWIGUNU } } { - if (!(&result)->ok()) { - SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str()); + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); } - resultobj = SWIG_From_bool((&result)->ok()); + } + { + delete arg2; } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsWithCheck(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< int > *arg2 = 0 ; + std::vector< std::vector< int > > *arg2 = 0 ; + int arg3 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[2] ; - std::string result; + int val3 ; + int ecode3 = 0 ; + PyObject *swig_obj[3] ; + BytesArray result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodeIdsWithCheck", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch", 3, 3, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodeIdsWithCheck" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - std::vector *out = nullptr; + std::vector> *out = nullptr; if (PyList_Check(swig_obj[1])) { const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector(size); + out = new std::vector>(size); for (size_t i = 0; i < size; ++i) { PyObject *o = PyList_GetItem(swig_obj[1], i); - if (PyInt_Check(o)) { - (*out)[i] = static_cast(PyInt_AsLong(o)); + if (PyList_Check(o)) { + const size_t size2 = PyList_Size(o); + (*out)[i].resize(size2); + for (size_t j = 0; j < size2; ++j) { + PyObject *o2 = PyList_GetItem(o, j); + if (PyInt_Check(o2)) { + (*out)[i][j] = static_cast(PyInt_AsLong(o2)); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + } } else { - PyErr_SetString(PyExc_TypeError,"list must contain integers"); + PyErr_SetString(PyExc_TypeError, "not a list"); SWIG_fail; } } @@ -5182,9 +5758,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsWithCheck(PyObject *S } arg2 = out; } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); { try { - result = sentencepiece_SentencePieceProcessor_DecodeIdsWithCheck((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); + result = sentencepiece_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< int > > const &)*arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5192,8 +5773,10 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsWithCheck(PyObject *S } } { - PyObject *input_type = resultobj; - resultobj = MakePyOutputString(result, input_type); + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); + } } { delete arg2; @@ -5207,32 +5790,46 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - std::vector< int > *arg2 = 0 ; + std::vector< std::vector< std::string > > *arg2 = 0 ; + int arg3 ; void *argp1 = 0 ; int res1 = 0 ; - PyObject *swig_obj[2] ; - sentencepiece::util::bytes result; + int val3 ; + int ecode3 = 0 ; + PyObject *swig_obj[3] ; + std::vector< std::string > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck", 2, 2, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesBatch", 3, 3, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { - std::vector *out = nullptr; + std::vector> *out = nullptr; if (PyList_Check(swig_obj[1])) { const size_t size = PyList_Size(swig_obj[1]); - out = new std::vector(size); + out = new std::vector>(size); for (size_t i = 0; i < size; ++i) { PyObject *o = PyList_GetItem(swig_obj[1], i); - if (PyInt_Check(o)) { - (*out)[i] = static_cast(PyInt_AsLong(o)); + if (PyList_Check(o)) { + const size_t size2 = PyList_Size(o); + (*out)[i].resize(size2); + for (size_t j = 0; j < size2; ++j) { + const PyInputString ustring(PyList_GetItem(o, j)); + if (ustring.IsAvalable()) { + (*out)[i][j].assign(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } } else { - PyErr_SetString(PyExc_TypeError,"list must contain integers"); + PyErr_SetString(PyExc_TypeError,"not a list"); SWIG_fail; } } @@ -5242,9 +5839,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWith } arg2 = out; } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodePiecesBatch" "', argument " "3"" of type '" "int""'"); + } + arg3 = static_cast< int >(val3); { try { - result = sentencepiece_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< int > const &)*arg2); + result = sentencepiece_SentencePieceProcessor__DecodePiecesBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5252,7 +5854,11 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWith } } { - resultobj = MakePyOutputBytes(result); + PyObject *input_type = resultobj; + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + } } { delete arg2; @@ -5266,81 +5872,63 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; - absl::string_view arg2 ; - bool arg3 ; - int arg4 ; - float arg5 ; - bool arg6 ; - bool arg7 ; - bool arg8 ; + std::vector< std::vector< std::string > > *arg2 = 0 ; + int arg3 ; void *argp1 = 0 ; int res1 = 0 ; - bool val3 ; + int val3 ; int ecode3 = 0 ; - int val4 ; - int ecode4 = 0 ; - float val5 ; - int ecode5 = 0 ; - bool val6 ; - int ecode6 = 0 ; - bool val7 ; - int ecode7 = 0 ; - bool val8 ; - int ecode8 = 0 ; - PyObject *swig_obj[8] ; - std::vector< int > result; + PyObject *swig_obj[3] ; + BytesArray result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsIds", 8, 8, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch", 3, 3, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); - } - arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); - { - const PyInputString ustring(swig_obj[1]); - if (!ustring.IsAvalable()) { - PyErr_SetString(PyExc_TypeError, "not a string"); - SWIG_fail; - } - resultobj = ustring.input_type(); - arg2 = absl::string_view(ustring.data(), ustring.size()); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } - ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); - if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "3"" of type '" "bool""'"); - } - arg3 = static_cast< bool >(val3); - ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); - if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "4"" of type '" "int""'"); - } - arg4 = static_cast< int >(val4); - ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); - if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "5"" of type '" "float""'"); - } - arg5 = static_cast< float >(val5); - ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); - if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "6"" of type '" "bool""'"); - } - arg6 = static_cast< bool >(val6); - ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); - if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "7"" of type '" "bool""'"); - } - arg7 = static_cast< bool >(val7); - ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); - if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsIds" "', argument " "8"" of type '" "bool""'"); + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector> *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector>(size); + for (size_t i = 0; i < size; ++i) { + PyObject *o = PyList_GetItem(swig_obj[1], i); + if (PyList_Check(o)) { + const size_t size2 = PyList_Size(o); + (*out)[i].resize(size2); + for (size_t j = 0; j < size2; ++j) { + const PyInputString ustring(PyList_GetItem(o, j)); + if (ustring.IsAvalable()) { + (*out)[i][j].assign(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError,"list must contain integers"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + } + } else { + PyErr_SetString(PyExc_TypeError,"not a list"); + SWIG_fail; + } + arg2 = out; + } + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch" "', argument " "3"" of type '" "int""'"); } - arg8 = static_cast< bool >(val8); + arg3 = static_cast< int >(val3); { try { - result = sentencepiece_SentencePieceProcessor__EncodeAsIds(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8); + result = sentencepiece_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch((sentencepiece::SentencePieceProcessor const *)arg1,(std::vector< std::vector< std::string > > const &)*arg2,arg3); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5350,49 +5938,49 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsIds(PyObject *SWIGUNU { resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, PyInt_FromLong(static_cast(result[i]))); + PyList_SetItem(resultobj, i, MakePyOutputBytes(result[i])); } } + { + delete arg2; + } return resultobj; fail: + { + delete arg2; + } return NULL; } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; - bool arg3 ; - int arg4 ; - float arg5 ; + int arg3 ; + bool arg4 ; + bool arg5 ; bool arg6 ; bool arg7 ; - bool arg8 ; - bool arg9 ; void *argp1 = 0 ; int res1 = 0 ; - bool val3 ; + int val3 ; int ecode3 = 0 ; - int val4 ; + bool val4 ; int ecode4 = 0 ; - float val5 ; + bool val5 ; int ecode5 = 0 ; bool val6 ; int ecode6 = 0 ; bool val7 ; int ecode7 = 0 ; - bool val8 ; - int ecode8 = 0 ; - bool val9 ; - int ecode9 = 0 ; - PyObject *swig_obj[9] ; - std::vector< std::string > result; + PyObject *swig_obj[7] ; + std::vector< std::vector< int > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__EncodeAsPieces", 9, 9, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsIds", 7, 7, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5404,44 +5992,34 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG resultobj = ustring.input_type(); arg2 = absl::string_view(ustring.data(), ustring.size()); } - ecode3 = SWIG_AsVal_bool(swig_obj[2], &val3); + ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "3"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "3"" of type '" "int""'"); } - arg3 = static_cast< bool >(val3); - ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); + arg3 = static_cast< int >(val3); + ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "4"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "4"" of type '" "bool""'"); } - arg4 = static_cast< int >(val4); - ecode5 = SWIG_AsVal_float(swig_obj[4], &val5); + arg4 = static_cast< bool >(val4); + ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "5"" of type '" "float""'"); + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "5"" of type '" "bool""'"); } - arg5 = static_cast< float >(val5); + arg5 = static_cast< bool >(val5); ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "6"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "6"" of type '" "bool""'"); } arg6 = static_cast< bool >(val6); ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "7"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "7"" of type '" "bool""'"); } arg7 = static_cast< bool >(val7); - ecode8 = SWIG_AsVal_bool(swig_obj[7], &val8); - if (!SWIG_IsOK(ecode8)) { - SWIG_exception_fail(SWIG_ArgError(ecode8), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "8"" of type '" "bool""'"); - } - arg8 = static_cast< bool >(val8); - ecode9 = SWIG_AsVal_bool(swig_obj[8], &val9); - if (!SWIG_IsOK(ecode9)) { - SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__EncodeAsPieces" "', argument " "9"" of type '" "bool""'"); - } - arg9 = static_cast< bool >(val9); { try { - result = sentencepiece_SentencePieceProcessor__EncodeAsPieces(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); + result = sentencepiece_SentencePieceProcessor__NBestEncodeAsIds((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5449,10 +6027,13 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__EncodeAsPieces(PyObject *SWIG } } { - PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { - PyList_SetItem(resultobj, i, MakePyOutputString(result[i], input_type)); + PyObject *obj = PyList_New(result[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { + PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + } + PyList_SetItem(resultobj, i, obj); } } return resultobj; @@ -5461,7 +6042,7 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; @@ -5469,6 +6050,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW bool arg4 ; bool arg5 ; bool arg6 ; + bool arg7 ; void *argp1 = 0 ; int res1 = 0 ; int val3 ; @@ -5479,13 +6061,15 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW int ecode5 = 0 ; bool val6 ; int ecode6 = 0 ; - PyObject *swig_obj[6] ; - std::vector< std::vector< int > > result; + bool val7 ; + int ecode7 = 0 ; + PyObject *swig_obj[7] ; + std::vector< std::vector< std::string > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsIds", 6, 6, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsPieces", 7, 7, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5499,27 +6083,32 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "3"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "3"" of type '" "int""'"); } arg3 = static_cast< int >(val3); ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "4"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "4"" of type '" "bool""'"); } arg4 = static_cast< bool >(val4); ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "5"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "5"" of type '" "bool""'"); } arg5 = static_cast< bool >(val5); ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsIds" "', argument " "6"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "6"" of type '" "bool""'"); } arg6 = static_cast< bool >(val6); + ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); + if (!SWIG_IsOK(ecode7)) { + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "7"" of type '" "bool""'"); + } + arg7 = static_cast< bool >(val7); { try { - result = sentencepiece_SentencePieceProcessor__NBestEncodeAsIds(arg1,arg2,arg3,arg4,arg5,arg6); + result = sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5527,11 +6116,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsIds(PyObject *SW } } { + PyObject *input_type = resultobj; resultobj = PyList_New((&result)->size()); for (size_t i = 0; i < (&result)->size(); ++i) { PyObject *obj = PyList_New(result[i].size()); for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, PyInt_FromLong(static_cast(result[i][j]))); + PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); } PyList_SetItem(resultobj, i, obj); } @@ -5542,7 +6132,7 @@ fail: } -SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *resultobj = 0; sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; absl::string_view arg2 ; @@ -5564,12 +6154,12 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject bool val7 ; int ecode7 = 0 ; PyObject *swig_obj[7] ; - std::vector< std::vector< std::string > > result; + sentencepiece::util::bytes result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsPieces", 7, 7, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__NBestEncodeAsSerializedProto", 7, 7, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5583,32 +6173,32 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject } ecode3 = SWIG_AsVal_int(swig_obj[2], &val3); if (!SWIG_IsOK(ecode3)) { - SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "3"" of type '" "int""'"); + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "3"" of type '" "int""'"); } arg3 = static_cast< int >(val3); ecode4 = SWIG_AsVal_bool(swig_obj[3], &val4); if (!SWIG_IsOK(ecode4)) { - SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "4"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "4"" of type '" "bool""'"); } arg4 = static_cast< bool >(val4); ecode5 = SWIG_AsVal_bool(swig_obj[4], &val5); if (!SWIG_IsOK(ecode5)) { - SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "5"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode5), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "5"" of type '" "bool""'"); } arg5 = static_cast< bool >(val5); ecode6 = SWIG_AsVal_bool(swig_obj[5], &val6); if (!SWIG_IsOK(ecode6)) { - SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "6"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode6), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "6"" of type '" "bool""'"); } arg6 = static_cast< bool >(val6); ecode7 = SWIG_AsVal_bool(swig_obj[6], &val7); if (!SWIG_IsOK(ecode7)) { - SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsPieces" "', argument " "7"" of type '" "bool""'"); + SWIG_exception_fail(SWIG_ArgError(ecode7), "in method '" "SentencePieceProcessor__NBestEncodeAsSerializedProto" "', argument " "7"" of type '" "bool""'"); } arg7 = static_cast< bool >(val7); { try { - result = sentencepiece_SentencePieceProcessor__NBestEncodeAsPieces(arg1,arg2,arg3,arg4,arg5,arg6,arg7); + result = sentencepiece_SentencePieceProcessor__NBestEncodeAsSerializedProto((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5616,15 +6206,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__NBestEncodeAsPieces(PyObject } } { - PyObject *input_type = resultobj; - resultobj = PyList_New((&result)->size()); - for (size_t i = 0; i < (&result)->size(); ++i) { - PyObject *obj = PyList_New(result[i].size()); - for (size_t j = 0; j < result[i].size(); ++j) { - PyList_SetItem(obj, j, MakePyOutputString(result[i][j], input_type)); - } - PyList_SetItem(resultobj, i, obj); - } + resultobj = MakePyOutputBytes(result); } return resultobj; fail: @@ -5643,6 +6225,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO bool arg7 ; bool arg8 ; bool arg9 ; + bool arg10 ; void *argp1 = 0 ; int res1 = 0 ; int val3 ; @@ -5659,13 +6242,15 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO int ecode8 = 0 ; bool val9 ; int ecode9 = 0 ; - PyObject *swig_obj[9] ; + bool val10 ; + int ecode10 = 0 ; + PyObject *swig_obj[10] ; std::vector< std::pair< std::vector< int >,float > > result; - if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsIds", 9, 9, swig_obj)) SWIG_fail; + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsIds", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5712,9 +6297,14 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds(PyO SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "9"" of type '" "bool""'"); } arg9 = static_cast< bool >(val9); + ecode10 = SWIG_AsVal_bool(swig_obj[9], &val10); + if (!SWIG_IsOK(ecode10)) { + SWIG_exception_fail(SWIG_ArgError(ecode10), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsIds" "', argument " "10"" of type '" "bool""'"); + } + arg10 = static_cast< bool >(val10); { try { - result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9); + result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsIds((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5773,7 +6363,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__SampleEncodeAndScoreAsPieces", 10, 10, swig_obj)) SWIG_fail; res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); if (!SWIG_IsOK(res1)) { - SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__SampleEncodeAndScoreAsPieces" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor const *""'"); } arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); { @@ -5827,7 +6417,7 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces( arg10 = static_cast< bool >(val10); { try { - result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); + result = sentencepiece_SentencePieceProcessor__SampleEncodeAndScoreAsPieces((sentencepiece::SentencePieceProcessor const *)arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10); ReleaseResultObject(resultobj); } catch (const sentencepiece::util::Status &status) { @@ -5851,6 +6441,133 @@ fail: } +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropy(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + absl::string_view arg2 ; + float arg3 ; + void *argp1 = 0 ; + int res1 = 0 ; + float val3 ; + int ecode3 = 0 ; + PyObject *swig_obj[3] ; + float result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__CalculateEntropy", 3, 3, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__CalculateEntropy" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + const PyInputString ustring(swig_obj[1]); + if (!ustring.IsAvalable()) { + PyErr_SetString(PyExc_TypeError, "not a string"); + SWIG_fail; + } + resultobj = ustring.input_type(); + arg2 = absl::string_view(ustring.data(), ustring.size()); + } + ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__CalculateEntropy" "', argument " "3"" of type '" "float""'"); + } + arg3 = static_cast< float >(val3); + { + try { + result = (float)sentencepiece_SentencePieceProcessor__CalculateEntropy(arg1,arg2,arg3); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + resultobj = SWIG_From_float(static_cast< float >(result)); + return resultobj; +fail: + return NULL; +} + + +SWIGINTERN PyObject *_wrap_SentencePieceProcessor__CalculateEntropyBatch(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { + PyObject *resultobj = 0; + sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ; + std::vector< absl::string_view > *arg2 = 0 ; + float arg3 ; + int arg4 ; + void *argp1 = 0 ; + int res1 = 0 ; + float val3 ; + int ecode3 = 0 ; + int val4 ; + int ecode4 = 0 ; + PyObject *swig_obj[4] ; + std::vector< float > result; + + if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor__CalculateEntropyBatch", 4, 4, swig_obj)) SWIG_fail; + res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor__CalculateEntropyBatch" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); + } + arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1); + { + std::vector *out = nullptr; + if (PyList_Check(swig_obj[1])) { + const size_t size = PyList_Size(swig_obj[1]); + out = new std::vector(size); + for (size_t i = 0; i < size; ++i) { + const PyInputString ustring(PyList_GetItem(swig_obj[1], i)); + if (ustring.IsAvalable()) { + (*out)[i] = absl::string_view(ustring.data(), ustring.size()); + } else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + SWIG_fail; + } + resultobj = ustring.input_type(); + } + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + SWIG_fail; + } + arg2 = out; + } + ecode3 = SWIG_AsVal_float(swig_obj[2], &val3); + if (!SWIG_IsOK(ecode3)) { + SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "SentencePieceProcessor__CalculateEntropyBatch" "', argument " "3"" of type '" "float""'"); + } + arg3 = static_cast< float >(val3); + ecode4 = SWIG_AsVal_int(swig_obj[3], &val4); + if (!SWIG_IsOK(ecode4)) { + SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "SentencePieceProcessor__CalculateEntropyBatch" "', argument " "4"" of type '" "int""'"); + } + arg4 = static_cast< int >(val4); + { + try { + result = sentencepiece_SentencePieceProcessor__CalculateEntropyBatch(arg1,(std::vector< absl::string_view > const &)*arg2,arg3,arg4); + ReleaseResultObject(resultobj); + } + catch (const sentencepiece::util::Status &status) { + SWIG_exception(ToSwigError(status.code()), status.ToString().c_str()); + } + } + { + resultobj = PyList_New((&result)->size()); + for (size_t i = 0; i < (&result)->size(); ++i) { + PyList_SetItem(resultobj, i, PyFloat_FromDouble(static_cast(result[i]))); + } + } + { + delete arg2; + } + return resultobj; +fail: + { + delete arg2; + } + return NULL; +} + + SWIGINTERN PyObject *SentencePieceProcessor_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; if (!SWIG_Python_UnpackTuple(args, "swigregister", 1, 1, &obj)) return NULL; @@ -6191,20 +6908,9 @@ static PyMethodDef SwigMethods[] = { { "SentencePieceProcessor_SetVocabulary", _wrap_SentencePieceProcessor_SetVocabulary, METH_VARARGS, NULL}, { "SentencePieceProcessor_ResetVocabulary", _wrap_SentencePieceProcessor_ResetVocabulary, METH_O, NULL}, { "SentencePieceProcessor_LoadVocabulary", _wrap_SentencePieceProcessor_LoadVocabulary, METH_VARARGS, NULL}, - { "SentencePieceProcessor_EncodeAsPieces", _wrap_SentencePieceProcessor_EncodeAsPieces, METH_VARARGS, NULL}, - { "SentencePieceProcessor_EncodeAsIds", _wrap_SentencePieceProcessor_EncodeAsIds, METH_VARARGS, NULL}, - { "SentencePieceProcessor_NBestEncodeAsPieces", _wrap_SentencePieceProcessor_NBestEncodeAsPieces, METH_VARARGS, NULL}, - { "SentencePieceProcessor_NBestEncodeAsIds", _wrap_SentencePieceProcessor_NBestEncodeAsIds, METH_VARARGS, NULL}, - { "SentencePieceProcessor_SampleEncodeAsPieces", _wrap_SentencePieceProcessor_SampleEncodeAsPieces, METH_VARARGS, NULL}, - { "SentencePieceProcessor_SampleEncodeAsIds", _wrap_SentencePieceProcessor_SampleEncodeAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor_SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, { "SentencePieceProcessor_SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor_SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, - { "SentencePieceProcessor_DecodePieces", _wrap_SentencePieceProcessor_DecodePieces, METH_VARARGS, NULL}, { "SentencePieceProcessor_CalculateEntropy", _wrap_SentencePieceProcessor_CalculateEntropy, METH_VARARGS, NULL}, - { "SentencePieceProcessor_EncodeAsSerializedProto", _wrap_SentencePieceProcessor_EncodeAsSerializedProto, METH_VARARGS, NULL}, - { "SentencePieceProcessor_SampleEncodeAsSerializedProto", _wrap_SentencePieceProcessor_SampleEncodeAsSerializedProto, METH_VARARGS, NULL}, - { "SentencePieceProcessor_NBestEncodeAsSerializedProto", _wrap_SentencePieceProcessor_NBestEncodeAsSerializedProto, METH_VARARGS, NULL}, - { "SentencePieceProcessor_DecodePiecesAsSerializedProto", _wrap_SentencePieceProcessor_DecodePiecesAsSerializedProto, METH_VARARGS, NULL}, { "SentencePieceProcessor_GetPieceSize", _wrap_SentencePieceProcessor_GetPieceSize, METH_O, NULL}, { "SentencePieceProcessor_PieceToId", _wrap_SentencePieceProcessor_PieceToId, METH_VARARGS, NULL}, { "SentencePieceProcessor_IdToPiece", _wrap_SentencePieceProcessor_IdToPiece, METH_VARARGS, NULL}, @@ -6219,14 +6925,27 @@ static PyMethodDef SwigMethods[] = { { "SentencePieceProcessor_pad_id", _wrap_SentencePieceProcessor_pad_id, METH_O, NULL}, { "SentencePieceProcessor_serialized_model_proto", _wrap_SentencePieceProcessor_serialized_model_proto, METH_O, NULL}, { "SentencePieceProcessor_LoadFromFile", _wrap_SentencePieceProcessor_LoadFromFile, METH_VARARGS, NULL}, - { "SentencePieceProcessor_DecodeIdsWithCheck", _wrap_SentencePieceProcessor_DecodeIdsWithCheck, METH_VARARGS, NULL}, - { "SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck", _wrap_SentencePieceProcessor_DecodeIdsAsSerializedProtoWithCheck, METH_VARARGS, NULL}, { "SentencePieceProcessor__EncodeAsIds", _wrap_SentencePieceProcessor__EncodeAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor__EncodeAsPieces", _wrap_SentencePieceProcessor__EncodeAsPieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsSerializedProto", _wrap_SentencePieceProcessor__EncodeAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsIdsBatch", _wrap_SentencePieceProcessor__EncodeAsIdsBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsPiecesBatch", _wrap_SentencePieceProcessor__EncodeAsPiecesBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__EncodeAsSerializedProtoBatch", _wrap_SentencePieceProcessor__EncodeAsSerializedProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIds", _wrap_SentencePieceProcessor__DecodeIds, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePieces", _wrap_SentencePieceProcessor__DecodePieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsAsSerializedProto", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsSerializedProto", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProto, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsBatch", _wrap_SentencePieceProcessor__DecodeIdsBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesBatch", _wrap_SentencePieceProcessor__DecodePiecesBatch, METH_VARARGS, NULL}, + { "SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch", _wrap_SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor__NBestEncodeAsIds", _wrap_SentencePieceProcessor__NBestEncodeAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor__NBestEncodeAsPieces", _wrap_SentencePieceProcessor__NBestEncodeAsPieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__NBestEncodeAsSerializedProto", _wrap_SentencePieceProcessor__NBestEncodeAsSerializedProto, METH_VARARGS, NULL}, { "SentencePieceProcessor__SampleEncodeAndScoreAsIds", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsIds, METH_VARARGS, NULL}, { "SentencePieceProcessor__SampleEncodeAndScoreAsPieces", _wrap_SentencePieceProcessor__SampleEncodeAndScoreAsPieces, METH_VARARGS, NULL}, + { "SentencePieceProcessor__CalculateEntropy", _wrap_SentencePieceProcessor__CalculateEntropy, METH_VARARGS, NULL}, + { "SentencePieceProcessor__CalculateEntropyBatch", _wrap_SentencePieceProcessor__CalculateEntropyBatch, METH_VARARGS, NULL}, { "SentencePieceProcessor_swigregister", SentencePieceProcessor_swigregister, METH_O, NULL}, { "SentencePieceProcessor_swiginit", SentencePieceProcessor_swiginit, METH_VARARGS, NULL}, { "SetRandomGeneratorSeed", _wrap_SetRandomGeneratorSeed, METH_O, NULL}, @@ -6252,8 +6971,11 @@ static swig_type_info _swigt__p_sentencepiece__SentencePieceProcessor = {"_p_sen static swig_type_info _swigt__p_sentencepiece__SentencePieceTrainer = {"_p_sentencepiece__SentencePieceTrainer", "sentencepiece::SentencePieceTrainer *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_std__string = {"_p_std__string", "sentencepiece::util::bytes *|std::string *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_std__unordered_mapT_std__string_std__string_t = {"_p_std__unordered_mapT_std__string_std__string_t", "std::unordered_map< std::string,std::string > *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_std__vectorT_absl__string_view_t = {"_p_std__vectorT_absl__string_view_t", "std::vector< absl::string_view > *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_std__vectorT_int_t = {"_p_std__vectorT_int_t", "std::vector< int > *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_std__vectorT_std__string_t = {"_p_std__vectorT_std__string_t", "std::vector< std::string > *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_std__vectorT_std__vectorT_int_t_t = {"_p_std__vectorT_std__vectorT_int_t_t", "std::vector< std::vector< int > > *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_std__vectorT_std__vectorT_std__string_t_t = {"_p_std__vectorT_std__vectorT_std__string_t_t", "std::vector< std::vector< std::string > > *", 0, 0, (void*)0, 0}; static swig_type_info *swig_type_initial[] = { &_swigt__p_char, @@ -6262,8 +6984,11 @@ static swig_type_info *swig_type_initial[] = { &_swigt__p_sentencepiece__SentencePieceTrainer, &_swigt__p_std__string, &_swigt__p_std__unordered_mapT_std__string_std__string_t, + &_swigt__p_std__vectorT_absl__string_view_t, &_swigt__p_std__vectorT_int_t, &_swigt__p_std__vectorT_std__string_t, + &_swigt__p_std__vectorT_std__vectorT_int_t_t, + &_swigt__p_std__vectorT_std__vectorT_std__string_t_t, }; static swig_cast_info _swigc__p_char[] = { {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}}; @@ -6272,8 +6997,11 @@ static swig_cast_info _swigc__p_sentencepiece__SentencePieceProcessor[] = { {&_ static swig_cast_info _swigc__p_sentencepiece__SentencePieceTrainer[] = { {&_swigt__p_sentencepiece__SentencePieceTrainer, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_std__string[] = { {&_swigt__p_std__string, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_std__unordered_mapT_std__string_std__string_t[] = { {&_swigt__p_std__unordered_mapT_std__string_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_std__vectorT_absl__string_view_t[] = { {&_swigt__p_std__vectorT_absl__string_view_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_std__vectorT_int_t[] = { {&_swigt__p_std__vectorT_int_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_std__vectorT_std__string_t[] = { {&_swigt__p_std__vectorT_std__string_t, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_std__vectorT_std__vectorT_int_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_int_t_t, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_std__vectorT_std__vectorT_std__string_t_t[] = { {&_swigt__p_std__vectorT_std__vectorT_std__string_t_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info *swig_cast_initial[] = { _swigc__p_char, @@ -6282,8 +7010,11 @@ static swig_cast_info *swig_cast_initial[] = { _swigc__p_sentencepiece__SentencePieceTrainer, _swigc__p_std__string, _swigc__p_std__unordered_mapT_std__string_std__string_t, + _swigc__p_std__vectorT_absl__string_view_t, _swigc__p_std__vectorT_int_t, _swigc__p_std__vectorT_std__string_t, + _swigc__p_std__vectorT_std__vectorT_int_t_t, + _swigc__p_std__vectorT_std__vectorT_std__string_t_t, }; diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py index b747e81..99e36f3 100755 --- a/python/test/sentencepiece_test.py +++ b/python/test/sentencepiece_test.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License.! -import codecs import io import sentencepiece as spm import unittest @@ -62,6 +61,17 @@ class TestSentencepieceProcessor(unittest.TestCase): piece = self.sp_.IdToPiece(i) self.assertEqual(i, self.sp_.PieceToId(piece)) + self.assertEqual(1000, self.sp_.get_piece_size()) + self.assertEqual(0, self.sp_.piece_to_id('')) + self.assertEqual(1, self.sp_.piece_to_id('')) + self.assertEqual(2, self.sp_.piece_to_id('')) + self.assertEqual('', self.sp_.id_to_piece(0)) + self.assertEqual('', self.sp_.id_to_piece(1)) + self.assertEqual('', self.sp_.id_to_piece(2)) + for i in range(self.sp_.get_piece_size()): + piece = self.sp_.id_to_piece(i) + self.assertEqual(i, self.sp_.piece_to_id(piece)) + def test_roundtrip(self): text = 'I saw a girl with a telescope.' ids = self.sp_.EncodeAsIds(text) @@ -82,6 +92,34 @@ class TestSentencepieceProcessor(unittest.TestCase): self.assertEqual( text, self.sp_.DecodeIds(self.sp_.SampleEncodeAsIds(text, -1, 0.5))) + ids2 = self.sp_.encode_as_ids(text) + pieces3 = self.sp_.encode_as_pieces(text) + pieces4 = self.sp_.nbest_encode_as_pieces(text, 10)[0] + self.assertEqual(pieces3, pieces4) + self.assertEqual(pieces1, pieces3) + self.assertEqual(ids, ids2) + self.assertEqual(text, self.sp_.decode_pieces(pieces3)) + self.assertEqual(text, self.sp_.decode_ids(ids2)) + for n in range(100): + self.assertEqual( + text, + self.sp_.decode_pieces( + self.sp_.sample_encode_as_pieces(text, 64, 0.5))) + self.assertEqual( + text, + self.sp_.decode_pieces( + self.sp_.sample_encode_as_pieces(text, -1, 0.5))) + self.assertEqual( + text, + self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, 64, 0.5))) + self.assertEqual( + text, + self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, -1, 0.5))) + + self.assertEqual( + self.sp_.calculate_entropy(text, 0.1), + self.sp_.CalculateEntropy(text, 0.1)) + def test_ja_load(self): self.assertEqual(8000, self.jasp_.GetPieceSize()) self.assertEqual(0, self.jasp_.PieceToId('')) @@ -94,6 +132,17 @@ class TestSentencepieceProcessor(unittest.TestCase): piece = self.jasp_.IdToPiece(i) self.assertEqual(i, self.jasp_.PieceToId(piece)) + self.assertEqual(8000, self.jasp_.get_piece_size()) + self.assertEqual(0, self.jasp_.piece_to_id('')) + self.assertEqual(1, self.jasp_.piece_to_id('')) + self.assertEqual(2, self.jasp_.piece_to_id('')) + self.assertEqual('', self.jasp_.id_to_piece(0)) + self.assertEqual('', self.jasp_.id_to_piece(1)) + self.assertEqual('', self.jasp_.id_to_piece(2)) + for i in range(self.jasp_.get_piece_size()): + piece = self.jasp_.id_to_piece(i) + self.assertEqual(i, self.jasp_.piece_to_id(piece)) + def test_ja_roundtrip(self): text = '清水寺は京都にある。' ids = self.jasp_.EncodeAsIds(text) @@ -112,40 +161,27 @@ class TestSentencepieceProcessor(unittest.TestCase): self.jasp_.DecodePieces( self.jasp_.SampleEncodeAsPieces(text, -1, 0.5))) - def test_unicode_roundtrip(self): - text = u'I saw a girl with a telescope.' - ids = self.sp_.EncodeAsIds(text) - pieces = self.sp_.EncodeAsPieces(text) - self.assertEqual(text, self.sp_.DecodePieces(pieces)) - self.assertEqual(text, self.sp_.DecodeIds(ids)) - # python2 returns `str`. - if sys.version_info < (3, 0, 0): - text = text.encode('utf-8') - self.assertEqual(text, self.sp_.DecodeIds(ids)) - self.assertEqual(text, self.sp_.DecodePieces(pieces)) - - def test_unicode_ja_roundtrip(self): - text = u'清水寺は京都にある。' - ids = self.jasp_.EncodeAsIds(text) - pieces = self.jasp_.EncodeAsPieces(text) - self.assertEqual(text, self.jasp_.DecodePieces(pieces)) - # python2 returns `str`. - if sys.version_info < (3, 0, 0): - text = text.encode('utf-8') - self.assertEqual(text, self.jasp_.DecodeIds(ids)) - - def test_pickle(self): - with open('sp.pickle', 'wb') as f: - pickle.dump(self.sp_, f) - - id1 = self.sp_.encode('hello world.', out_type=int) - - with open('sp.pickle', 'rb') as f: - sp = pickle.load(f) - - id2 = sp.encode('hello world.', out_type=int) + ids2 = self.jasp_.encode_as_ids(text) + pieces3 = self.jasp_.encode_as_pieces(text) + pieces4 = self.jasp_.nbest_encode_as_pieces(text, 10)[0] + self.assertEqual(pieces3, pieces4) + self.assertEqual(pieces1, pieces3) + self.assertEqual(ids, ids2) + self.assertEqual(text, self.jasp_.decode_pieces(pieces1)) + self.assertEqual(text, self.jasp_.decode_ids(ids2)) + for n in range(100): + self.assertEqual( + text, + self.jasp_.decode_pieces( + self.jasp_.sample_encode_as_pieces(text, 64, 0.5))) + self.assertEqual( + text, + self.jasp_.decode_pieces( + self.jasp_.sample_encode_as_pieces(text, -1, 0.5))) - self.assertEqual(id1, id2) + self.assertEqual( + self.jasp_.calculate_entropy(text, 0.1), + self.jasp_.CalculateEntropy(text, 0.1)) def test_train(self): spm.SentencePieceTrainer.Train('--input=' + @@ -153,37 +189,45 @@ class TestSentencepieceProcessor(unittest.TestCase): ' --model_prefix=m --vocab_size=1000') sp = spm.SentencePieceProcessor() sp.Load('m.model') - with codecs.open( - os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: + with open(os.path.join(data_dir, 'botchan.txt'), 'r') as file: for line in file: sp.DecodePieces(sp.EncodeAsPieces(line)) sp.DecodeIds(sp.EncodeAsIds(line)) - def test_train(self): + def test_train_iterator(self): spm.SentencePieceTrainer.Train('--input=' + os.path.join(data_dir, 'botchan.txt') + ' --model_prefix=m --vocab_size=1000') # Load as 'rb' for Python3.5/2.7. - is1 = open(os.path.join(data_dir, 'botchan.txt'), 'rb') - is2 = open(os.path.join(data_dir, 'botchan.txt'), 'rb') os1 = io.BytesIO() os2 = io.BytesIO() + # suppress logging (redirect to /dev/null) spm.SentencePieceTrainer.train( input=os.path.join(data_dir, 'botchan.txt'), model_prefix='m', - vocab_size=1000) + vocab_size=1000, + logstream=open(os.devnull, 'w')) - spm.SentencePieceTrainer.train( - sentence_iterator=is1, model_prefix='m', vocab_size=1000) + with open(os.path.join(data_dir, 'botchan.txt'), 'rb') as is1: + spm.SentencePieceTrainer.train( + sentence_iterator=is1, + model_prefix='m', + vocab_size=1000, + logstream=open(os.devnull, 'w')) spm.SentencePieceTrainer.train( input=os.path.join(data_dir, 'botchan.txt'), model_writer=os1, - vocab_size=1000) + vocab_size=1000, + logstream=open(os.devnull, 'w')) - spm.SentencePieceTrainer.train( - sentence_iterator=is2, model_writer=os2, vocab_size=1000) + with open(os.path.join(data_dir, 'botchan.txt'), 'rb') as is2: + spm.SentencePieceTrainer.train( + sentence_iterator=is2, + model_writer=os2, + vocab_size=1000, + logstream=open(os.devnull, 'w')) sp1 = spm.SentencePieceProcessor(model_proto=os1.getvalue()) sp2 = spm.SentencePieceProcessor(model_proto=os2.getvalue()) @@ -200,127 +244,37 @@ class TestSentencepieceProcessor(unittest.TestCase): logstream=open(os.devnull, 'w')) sp = spm.SentencePieceProcessor() sp.Load('m.model') - with codecs.open( + with open( os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: for line in file: sp.DecodePieces(sp.EncodeAsPieces(line)) sp.DecodeIds(sp.EncodeAsIds(line)) - # snake case API. - def test_load_snake(self): - self.assertEqual(1000, self.sp_.get_piece_size()) - self.assertEqual(0, self.sp_.piece_to_id('')) - self.assertEqual(1, self.sp_.piece_to_id('')) - self.assertEqual(2, self.sp_.piece_to_id('')) - self.assertEqual('', self.sp_.id_to_piece(0)) - self.assertEqual('', self.sp_.id_to_piece(1)) - self.assertEqual('', self.sp_.id_to_piece(2)) - for i in range(self.sp_.get_piece_size()): - piece = self.sp_.id_to_piece(i) - self.assertEqual(i, self.sp_.piece_to_id(piece)) - - def test_roundtrip_snake(self): - text = 'I saw a girl with a telescope.' - ids = self.sp_.encode_as_ids(text) - pieces1 = self.sp_.encode_as_pieces(text) - pieces2 = self.sp_.nbest_encode_as_pieces(text, 10)[0] - self.assertEqual(pieces1, pieces2) - self.assertEqual(text, self.sp_.decode_pieces(pieces1)) - self.assertEqual(text, self.sp_.decode_ids(ids)) - for n in range(100): - self.assertEqual( - text, - self.sp_.decode_pieces( - self.sp_.sample_encode_as_pieces(text, 64, 0.5))) - self.assertEqual( - text, - self.sp_.decode_pieces( - self.sp_.sample_encode_as_pieces(text, -1, 0.5))) - self.assertEqual( - text, - self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, 64, 0.5))) - self.assertEqual( - text, - self.sp_.decode_ids(self.sp_.sample_encode_as_ids(text, -1, 0.5))) - - def test_ja_load_snake(self): - self.assertEqual(8000, self.jasp_.get_piece_size()) - self.assertEqual(0, self.jasp_.piece_to_id('')) - self.assertEqual(1, self.jasp_.piece_to_id('')) - self.assertEqual(2, self.jasp_.piece_to_id('')) - self.assertEqual('', self.jasp_.id_to_piece(0)) - self.assertEqual('', self.jasp_.id_to_piece(1)) - self.assertEqual('', self.jasp_.id_to_piece(2)) - for i in range(self.jasp_.get_piece_size()): - piece = self.jasp_.id_to_piece(i) - self.assertEqual(i, self.jasp_.piece_to_id(piece)) - - def test_ja_roundtrip_snake(self): - text = '清水寺は京都にある。' - ids = self.jasp_.encode_as_ids(text) - pieces1 = self.jasp_.encode_as_pieces(text) - pieces2 = self.jasp_.nbest_encode_as_pieces(text, 10)[0] - self.assertEqual(pieces1, pieces2) - self.assertEqual(text, self.jasp_.decode_pieces(pieces1)) - self.assertEqual(text, self.jasp_.decode_ids(ids)) - for n in range(100): - self.assertEqual( - text, - self.jasp_.decode_pieces( - self.jasp_.sample_encode_as_pieces(text, 64, 0.5))) - self.assertEqual( - text, - self.jasp_.decode_pieces( - self.jasp_.sample_encode_as_pieces(text, -1, 0.5))) - - def test_unicode_roundtrip_snake(self): - text = u'I saw a girl with a telescope.' - ids = self.sp_.encode_as_ids(text) - pieces = self.sp_.encode_as_pieces(text) - self.assertEqual(text, self.sp_.decode_pieces(pieces)) - # python2 returns `str`. - if sys.version_info < (3, 0, 0): - text = text.encode('utf-8') - self.assertEqual(text, self.sp_.decode_ids(ids)) - - def test_unicode_ja_roundtrip_snake(self): - text = u'清水寺は京都にある。' - ids = self.jasp_.encode_as_ids(text) - pieces = self.jasp_.encode_as_pieces(text) - self.assertEqual(text, self.jasp_.decode_pieces(pieces)) - # python2 returns `str`. - if sys.version_info < (3, 0, 0): - text = text.encode('utf-8') - self.assertEqual(text, self.jasp_.decode_ids(ids)) - - def test_train_snake(self): - spm.SentencePieceTrainer.train('--input=' + - os.path.join(data_dir, 'botchan.txt') + - ' --model_prefix=m --vocab_size=1000') - sp = spm.SentencePieceProcessor() - sp.load('m.model') - with codecs.open( - os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: - for line in file: - sp.decode_pieces(sp.encode_as_pieces(line)) - sp.decode_ids(sp.encode_as_ids(line)) - def test_serialized_proto(self): - text = u'I saw a girl with a telescope.' - self.assertNotEqual('', self.sp_.EncodeAsSerializedProto(text)) - self.assertNotEqual('', - self.sp_.SampleEncodeAsSerializedProto(text, 10, 0.2)) - self.assertNotEqual('', self.sp_.NBestEncodeAsSerializedProto(text, 10)) - self.assertNotEqual('', - self.sp_.DecodePiecesAsSerializedProto(['foo', 'bar'])) - self.assertNotEqual('', self.sp_.DecodeIdsAsSerializedProto([20, 30])) - self.assertNotEqual('', self.sp_.encode_as_serialized_proto(text)) - self.assertNotEqual( - '', self.sp_.sample_encode_as_serialized_proto(text, 10, 0.2)) - self.assertNotEqual('', self.sp_.nbest_encode_as_serialized_proto(text, 10)) - self.assertNotEqual( - '', self.sp_.decode_pieces_as_serialized_proto(['foo', 'bar'])) - self.assertNotEqual('', self.sp_.decode_ids_as_serialized_proto([20, 30])) + text = 'I saw a girl with a telescope.' + s1 = self.sp_.EncodeAsSerializedProto(text) + s2 = self.sp_.SampleEncodeAsSerializedProto(text, 10, 0.2) + s3 = self.sp_.NBestEncodeAsSerializedProto(text, 10) + s4 = self.sp_.DecodePiecesAsSerializedProto(['foo', 'bar']) + s5 = self.sp_.DecodeIdsAsSerializedProto([20, 30]) + + t1 = self.sp_.encode_as_serialized_proto(text) + t2 = self.sp_.sample_encode_as_serialized_proto(text, 10, 0.2) + t3 = self.sp_.nbest_encode_as_serialized_proto(text, 10) + t4 = self.sp_.decode_pieces_as_serialized_proto(['foo', 'bar']) + t5 = self.sp_.decode_ids_as_serialized_proto([20, 30]) + + self.assertEqual(type(s1), bytes) + self.assertEqual(type(s2), bytes) + self.assertEqual(type(t2), bytes) + self.assertEqual(type(s3), bytes) + self.assertEqual(type(s4), bytes) + self.assertEqual(type(s5), bytes) + + self.assertEqual(s1, t1) + self.assertEqual(s3, t3) + self.assertEqual(s4, t4) + self.assertEqual(s5, t5) def test_new_api(self): sp = spm.SentencePieceProcessor( @@ -331,19 +285,33 @@ class TestSentencepieceProcessor(unittest.TestCase): ids2 = self.sp_.EncodeAsIds(text2) pieces = self.sp_.EncodeAsPieces(text) pieces2 = self.sp_.EncodeAsPieces(text2) - self.assertEqual(sp.encode(text), ids) + protos = self.sp_.EncodeAsSerializedProto(text) + proto2 = self.sp_.EncodeAsSerializedProto(text2) + + self.assertEqual(sp.encode(text, out_type=int), ids) self.assertEqual(sp.encode(text, out_type=str), pieces) + self.assertEqual(sp.encode(text, out_type='proto'), protos) + + self.assertEqual(sp.encode([text], out_type=int), [ids]) + self.assertEqual(sp.encode([text], out_type=str), [pieces]) + self.assertEqual(sp.encode([text], out_type='proto'), [protos]) + detok_ids = self.sp_.DecodeIds(ids) detok_pieces = self.sp_.DecodePieces(pieces) self.assertEqual(sp.decode(ids), detok_ids) self.assertEqual(sp.decode(pieces), detok_pieces) + self.assertEqual(sp.decode([]), '') + self.assertEqual(sp.decode([[]]), ['']) # add_bos, add_eos, reverse self.assertEqual([sp.bos_id()] + ids, sp.encode(text, add_bos=True)) self.assertEqual(ids + [sp.eos_id()], sp.encode(text, add_eos=True)) + self.assertEqual(ids + [sp.eos_id()], sp.EncodeAsIds(text, add_eos=True)) rids = ids[:] rids.reverse() + self.assertEqual(rids, sp.encode(text, reverse=True)) + self.assertEqual(rids, sp.EncodeAsIds(text, reverse=True)) # different shape. self.assertEqual([ids, ids2], sp.encode([text, text2])) @@ -351,6 +319,29 @@ class TestSentencepieceProcessor(unittest.TestCase): self.assertEqual([text, text2], sp.decode([ids, ids2])) self.assertEqual([text, text2], sp.decode([pieces, pieces2])) + pieces = list(reversed(self.sp_.EncodeAsPieces(text))) + self.assertEqual(pieces, sp.encode(text, reverse=True, out_type=str)) + + # emit unk piece + unk_char = '藤' + pieces = self.sp_.EncodeAsIds(unk_char, emit_unk_piece=True) + pieces2 = self.sp_.encode(unk_char, out_type=int, emit_unk_piece=True) + self.assertEqual(pieces[1], sp.unk_id()) + self.assertEqual(pieces2[1], sp.unk_id()) + self.assertEqual(pieces, pieces2) + + pieces = self.sp_.EncodeAsPieces(unk_char, emit_unk_piece=True) + pieces2 = self.sp_.encode(unk_char, out_type=str, emit_unk_piece=True) + self.assertEqual(pieces[1], '') + self.assertEqual(pieces2[1], '') + self.assertEqual(pieces, pieces2) + + pieces = self.sp_.EncodeAsPieces(unk_char, emit_unk_piece=False) + pieces2 = self.sp_.encode(unk_char, out_type=str, emit_unk_piece=False) + self.assertEqual(pieces[1], unk_char) + self.assertEqual(pieces2[1], unk_char) + self.assertEqual(pieces, pieces2) + def test_new_api_init(self): sp = spm.SentencePieceProcessor( model_file=os.path.join('test', 'test_model.model'), @@ -361,7 +352,10 @@ class TestSentencepieceProcessor(unittest.TestCase): pieces = [''] + self.sp_.EncodeAsPieces(text) + [''] self.assertEqual(pieces, sp.encode(text)) - def test_new_api_sampling(self): + pieces = self.sp_.EncodeAsPieces(text) + [''] + self.assertEqual(pieces, sp.encode(text, add_bos=False, add_eos=True)) + + def test_sampling(self): sp = spm.SentencePieceProcessor( model_file=os.path.join('test', 'test_model.model'), out_type=str, @@ -376,25 +370,35 @@ class TestSentencepieceProcessor(unittest.TestCase): ++ids2[' '.join(sp.encode('hello world', enable_sampling=False))] self.assertEqual(len(ids2), 1) - def test_new_api_nbest(self): + def test_nbest(self): sp = spm.SentencePieceProcessor( model_file=os.path.join('test', 'test_model.model')) - results = sp.nbest_encode('hello world', nbest_size=10, out_type=str) + text = 'hello world' + results = sp.nbest_encode(text, nbest_size=10, out_type=str) + self.assertEqual(results, sp.NBestEncode(text, nbest_size=10, out_type=str)) for n in results: - self.assertEqual(sp.decode(n), 'hello world') - results = sp.nbest_encode('hello world', nbest_size=10, out_type=int) + self.assertEqual(sp.decode(n), text) + decoded = sp.decode(results) + for n in decoded: + self.assertEqual(n, text) + results = sp.nbest_encode(text, nbest_size=10, out_type=int) + self.assertEqual(results, sp.NBestEncode(text, nbest_size=10, out_type=int)) for n in results: - self.assertEqual(sp.decode(n), 'hello world') + self.assertEqual(sp.decode(n), text) + decoded = sp.decode(results) + for n in decoded: + self.assertEqual(n, text) - def test_new_api_sample_and_score(self): + def test_sample_and_score(self): sp = spm.SentencePieceProcessor( model_file=os.path.join('test', 'test_model.model')) - results = sp.sample_encode_and_score('hello world', wor=True, out_type=str) + text = 'hello world' + results = sp.sample_encode_and_score(text, wor=True, out_type=str) for n in results: - self.assertEqual(sp.decode(n[0]), 'hello world') - results = sp.sample_encode_and_score('hello world', wor=True, out_type=int) + self.assertEqual(sp.decode(n[0]), text) + results = sp.sample_encode_and_score(text, wor=True, out_type=int) for n in results: - self.assertEqual(sp.decode(n[0]), 'hello world') + self.assertEqual(sp.decode(n[0]), text) def test_valid_range(self): size = self.sp_.piece_size() @@ -412,6 +416,82 @@ class TestSentencepieceProcessor(unittest.TestCase): except: self.assertTrue(True) + def test_batch(self): + sp = spm.SentencePieceProcessor( + model_file=os.path.join('test', 'test_model.model')) + with open( + os.path.join(data_dir, 'botchan.txt'), 'r', encoding='utf-8') as file: + texts = file.readlines() + + r1 = sp.encode(texts, out_type=str, num_threads=None) + r2 = sp.encode(texts, out_type=str, num_threads=1) + r3 = sp.encode(texts, out_type=str, num_threads=-1) + r4 = sp.encode(texts, out_type=str, num_threads=8) + r5 = [sp.encode(s, out_type=str) for s in texts] + self.assertEqual(r1, r2) + self.assertEqual(r1, r3) + self.assertEqual(r1, r4) + self.assertEqual(r1, r5) + + d1 = sp.decode(r1, num_threads=None) + d2 = sp.decode(r2, num_threads=1) + d3 = sp.decode(r3, num_threads=-1) + d4 = sp.decode(r4, num_threads=8) + d5 = [sp.decode(s) for s in r5] + self.assertEqual(d1, d2) + self.assertEqual(d1, d3) + self.assertEqual(d1, d4) + self.assertEqual(d1, d5) + + r1 = sp.encode(texts, out_type=int, num_threads=None) + r2 = sp.encode(texts, out_type=int, num_threads=1) + r3 = sp.encode(texts, out_type=int, num_threads=-1) + r4 = sp.encode(texts, out_type=int, num_threads=8) + r5 = [sp.encode(s, out_type=int) for s in texts] + self.assertEqual(r1, r2) + self.assertEqual(r1, r3) + self.assertEqual(r1, r4) + self.assertEqual(r1, r5) + + d1 = sp.decode(r1, num_threads=None) + d2 = sp.decode(r2, num_threads=1) + d3 = sp.decode(r3, num_threads=-1) + d4 = sp.decode(r4, num_threads=8) + d5 = [sp.decode(s) for s in r5] + self.assertEqual(d1, d2) + self.assertEqual(d1, d3) + self.assertEqual(d1, d4) + self.assertEqual(d1, d5) + + r1 = sp.encode(texts, out_type='proto', num_threads=None) + r2 = sp.encode(texts, out_type='proto', num_threads=1) + r3 = sp.encode(texts, out_type='proto', num_threads=-1) + r4 = sp.encode(texts, out_type='proto', num_threads=8) + r5 = [sp.encode(s, out_type='proto') for s in texts] + self.assertEqual(r1, r2) + self.assertEqual(r1, r3) + self.assertEqual(r1, r4) + self.assertEqual(r1, r5) + + e1 = sp.calculate_entropy(texts, theta=1.0, num_threads=10) + e2 = sp.CalculateEntropy(texts, theta=1.0, num_threads=10) + e3 = [sp.calculate_entropy(s, theta=1.0) for s in texts] + self.assertEqual(e1, e2) + self.assertEqual(e1, e3) + + def test_pickle(self): + with open('sp.pickle', 'wb') as f: + pickle.dump(self.sp_, f) + + id1 = self.sp_.encode('hello world.', out_type=int) + + with open('sp.pickle', 'rb') as f: + sp = pickle.load(f) + + id2 = sp.encode('hello world.', out_type=int) + + self.assertEqual(id1, id2) + def suite(): suite = unittest.TestSuite() -- 2.30.2